1 /* 2 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2012, 2019, SAP SE. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "compiler/disassembler.hpp" 29 #include "gc/shared/collectedHeap.inline.hpp" 30 #include "gc/shared/barrierSet.hpp" 31 #include "gc/shared/barrierSetAssembler.hpp" 32 #include "interpreter/interpreter.hpp" 33 #include "memory/resourceArea.hpp" 34 #include "nativeInst_ppc.hpp" 35 #include "prims/methodHandles.hpp" 36 #include "runtime/biasedLocking.hpp" 37 #include "runtime/icache.hpp" 38 #include "runtime/interfaceSupport.inline.hpp" 39 #include "runtime/objectMonitor.hpp" 40 #include "runtime/os.hpp" 41 #include "runtime/safepoint.hpp" 42 #include "runtime/safepointMechanism.hpp" 43 #include "runtime/sharedRuntime.hpp" 44 #include "runtime/stubRoutines.hpp" 45 #include "utilities/macros.hpp" 46 #ifdef COMPILER2 47 #include "opto/intrinsicnode.hpp" 48 #endif 49 50 #ifdef PRODUCT 51 #define BLOCK_COMMENT(str) // nothing 52 #else 53 #define BLOCK_COMMENT(str) block_comment(str) 54 #endif 55 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 56 57 #ifdef ASSERT 58 // On RISC, there's no benefit to verifying instruction boundaries. 59 bool AbstractAssembler::pd_check_instruction_mark() { return false; } 60 #endif 61 62 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) { 63 assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range"); 64 if (Assembler::is_simm(si31, 16)) { 65 ld(d, si31, a); 66 if (emit_filler_nop) nop(); 67 } else { 68 const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31); 69 const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31); 70 addis(d, a, hi); 71 ld(d, lo, d); 72 } 73 } 74 75 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) { 76 assert_different_registers(d, a); 77 ld_largeoffset_unchecked(d, si31, a, emit_filler_nop); 78 } 79 80 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base, 81 size_t size_in_bytes, bool is_signed) { 82 switch (size_in_bytes) { 83 case 8: ld(dst, offs, base); break; 84 case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break; 85 case 2: is_signed ? 
lha(dst, offs, base) : lhz(dst, offs, base); break; 86 case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :( 87 default: ShouldNotReachHere(); 88 } 89 } 90 91 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base, 92 size_t size_in_bytes) { 93 switch (size_in_bytes) { 94 case 8: std(dst, offs, base); break; 95 case 4: stw(dst, offs, base); break; 96 case 2: sth(dst, offs, base); break; 97 case 1: stb(dst, offs, base); break; 98 default: ShouldNotReachHere(); 99 } 100 } 101 102 void MacroAssembler::align(int modulus, int max, int rem) { 103 int padding = (rem + modulus - (offset() % modulus)) % modulus; 104 if (padding > max) return; 105 for (int c = (padding >> 2); c > 0; --c) { nop(); } 106 } 107 108 // Issue instructions that calculate given TOC from global TOC. 109 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16, 110 bool add_relocation, bool emit_dummy_addr) { 111 int offset = -1; 112 if (emit_dummy_addr) { 113 offset = -128; // dummy address 114 } else if (addr != (address)(intptr_t)-1) { 115 offset = MacroAssembler::offset_to_global_toc(addr); 116 } 117 118 if (hi16) { 119 addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset)); 120 } 121 if (lo16) { 122 if (add_relocation) { 123 // Relocate at the addi to avoid confusion with a load from the method's TOC. 124 relocate(internal_word_Relocation::spec(addr)); 125 } 126 addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset)); 127 } 128 } 129 130 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) { 131 const int offset = MacroAssembler::offset_to_global_toc(addr); 132 133 const address inst2_addr = a; 134 const int inst2 = *(int *)inst2_addr; 135 136 // The relocation points to the second instruction, the addi, 137 // and the addi reads and writes the same register dst. 138 const int dst = inv_rt_field(inst2); 139 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 140 141 // Now, find the preceding addis which writes to dst. 142 int inst1 = 0; 143 address inst1_addr = inst2_addr - BytesPerInstWord; 144 while (inst1_addr >= bound) { 145 inst1 = *(int *) inst1_addr; 146 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 147 // Stop, found the addis which writes dst. 148 break; 149 } 150 inst1_addr -= BytesPerInstWord; 151 } 152 153 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 154 set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset)); 155 set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset)); 156 return inst1_addr; 157 } 158 159 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) { 160 const address inst2_addr = a; 161 const int inst2 = *(int *)inst2_addr; 162 163 // The relocation points to the second instruction, the addi, 164 // and the addi reads and writes the same register dst. 165 const int dst = inv_rt_field(inst2); 166 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 167 168 // Now, find the preceding addis which writes to dst. 
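  // (The addis need not immediately precede the addi; scan backwards,
  //  bounded by `bound', until the instruction writing dst is found.)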
169 int inst1 = 0; 170 address inst1_addr = inst2_addr - BytesPerInstWord; 171 while (inst1_addr >= bound) { 172 inst1 = *(int *) inst1_addr; 173 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 174 // stop, found the addis which writes dst 175 break; 176 } 177 inst1_addr -= BytesPerInstWord; 178 } 179 180 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 181 182 int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0); 183 // -1 is a special case 184 if (offset == -1) { 185 return (address)(intptr_t)-1; 186 } else { 187 return global_toc() + offset; 188 } 189 } 190 191 #ifdef _LP64 192 // Patch compressed oops or klass constants. 193 // Assembler sequence is 194 // 1) compressed oops: 195 // lis rx = const.hi 196 // ori rx = rx | const.lo 197 // 2) compressed klass: 198 // lis rx = const.hi 199 // clrldi rx = rx & 0xFFFFffff // clearMS32b, optional 200 // ori rx = rx | const.lo 201 // Clrldi will be passed by. 202 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) { 203 assert(UseCompressedOops, "Should only patch compressed oops"); 204 205 const address inst2_addr = a; 206 const int inst2 = *(int *)inst2_addr; 207 208 // The relocation points to the second instruction, the ori, 209 // and the ori reads and writes the same register dst. 210 const int dst = inv_rta_field(inst2); 211 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 212 // Now, find the preceding addis which writes to dst. 213 int inst1 = 0; 214 address inst1_addr = inst2_addr - BytesPerInstWord; 215 bool inst1_found = false; 216 while (inst1_addr >= bound) { 217 inst1 = *(int *)inst1_addr; 218 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; } 219 inst1_addr -= BytesPerInstWord; 220 } 221 assert(inst1_found, "inst is not lis"); 222 223 int xc = (data >> 16) & 0xffff; 224 int xd = (data >> 0) & 0xffff; 225 226 set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo 227 set_imm((int *)inst2_addr, (xd)); // unsigned int 228 return inst1_addr; 229 } 230 231 // Get compressed oop or klass constant. 232 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) { 233 assert(UseCompressedOops, "Should only patch compressed oops"); 234 235 const address inst2_addr = a; 236 const int inst2 = *(int *)inst2_addr; 237 238 // The relocation points to the second instruction, the ori, 239 // and the ori reads and writes the same register dst. 240 const int dst = inv_rta_field(inst2); 241 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 242 // Now, find the preceding lis which writes to dst. 243 int inst1 = 0; 244 address inst1_addr = inst2_addr - BytesPerInstWord; 245 bool inst1_found = false; 246 247 while (inst1_addr >= bound) { 248 inst1 = *(int *) inst1_addr; 249 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;} 250 inst1_addr -= BytesPerInstWord; 251 } 252 assert(inst1_found, "inst is not lis"); 253 254 uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff)); 255 uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16); 256 257 return (int) (xl | xh); 258 } 259 #endif // _LP64 260 261 // Returns true if successful. 
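// The emitted pattern is either a single `ld dst, toc_offset(toc)' (followed
// by a filler nop if fixed_size is set) or, for offsets that do not fit into
// 16 bits, an addis/ld pair as produced by ld_largeoffset_unchecked() above.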
262 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a, 263 Register toc, bool fixed_size) { 264 int toc_offset = 0; 265 // Use RelocationHolder::none for the constant pool entry, otherwise 266 // we will end up with a failing NativeCall::verify(x) where x is 267 // the address of the constant pool entry. 268 // FIXME: We should insert relocation information for oops at the constant 269 // pool entries instead of inserting it at the loads; patching of a constant 270 // pool entry should be less expensive. 271 address const_address = address_constant((address)a.value(), RelocationHolder::none); 272 if (const_address == NULL) { return false; } // allocation failure 273 // Relocate at the pc of the load. 274 relocate(a.rspec()); 275 toc_offset = (int)(const_address - code()->consts()->start()); 276 ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size); 277 return true; 278 } 279 280 bool MacroAssembler::is_load_const_from_method_toc_at(address a) { 281 const address inst1_addr = a; 282 const int inst1 = *(int *)inst1_addr; 283 284 // The relocation points to the ld or the addis. 285 return (is_ld(inst1)) || 286 (is_addis(inst1) && inv_ra_field(inst1) != 0); 287 } 288 289 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) { 290 assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc"); 291 292 const address inst1_addr = a; 293 const int inst1 = *(int *)inst1_addr; 294 295 if (is_ld(inst1)) { 296 return inv_d1_field(inst1); 297 } else if (is_addis(inst1)) { 298 const int dst = inv_rt_field(inst1); 299 300 // Now, find the succeeding ld which reads and writes to dst. 301 address inst2_addr = inst1_addr + BytesPerInstWord; 302 int inst2 = 0; 303 while (true) { 304 inst2 = *(int *) inst2_addr; 305 if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) { 306 // Stop, found the ld which reads and writes dst. 307 break; 308 } 309 inst2_addr += BytesPerInstWord; 310 } 311 return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2); 312 } 313 ShouldNotReachHere(); 314 return 0; 315 } 316 317 // Get the constant from a `load_const' sequence. 318 long MacroAssembler::get_const(address a) { 319 assert(is_load_const_at(a), "not a load of a constant"); 320 const int *p = (const int*) a; 321 unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48); 322 if (is_ori(*(p+1))) { 323 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32); 324 x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16); 325 x |= (((unsigned long) (get_imm(a,4) & 0xffff))); 326 } else if (is_lis(*(p+1))) { 327 x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32); 328 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16); 329 x |= (((unsigned long) (get_imm(a,3) & 0xffff))); 330 } else { 331 ShouldNotReachHere(); 332 return (long) 0; 333 } 334 return (long) x; 335 } 336 337 // Patch the 64 bit constant of a `load_const' sequence. This is a low 338 // level procedure. It neither flushes the instruction cache nor is it 339 // mt safe. 
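// Callers are responsible for making the change visible, e.g. by flushing
// the instruction cache afterwards (see the uses of
// ICache::ppc64_flush_icache_bytes in the patching routines below).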
340 void MacroAssembler::patch_const(address a, long x) { 341 assert(is_load_const_at(a), "not a load of a constant"); 342 int *p = (int*) a; 343 if (is_ori(*(p+1))) { 344 set_imm(0 + p, (x >> 48) & 0xffff); 345 set_imm(1 + p, (x >> 32) & 0xffff); 346 set_imm(3 + p, (x >> 16) & 0xffff); 347 set_imm(4 + p, x & 0xffff); 348 } else if (is_lis(*(p+1))) { 349 set_imm(0 + p, (x >> 48) & 0xffff); 350 set_imm(2 + p, (x >> 32) & 0xffff); 351 set_imm(1 + p, (x >> 16) & 0xffff); 352 set_imm(3 + p, x & 0xffff); 353 } else { 354 ShouldNotReachHere(); 355 } 356 } 357 358 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) { 359 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 360 int index = oop_recorder()->allocate_metadata_index(obj); 361 RelocationHolder rspec = metadata_Relocation::spec(index); 362 return AddressLiteral((address)obj, rspec); 363 } 364 365 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) { 366 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 367 int index = oop_recorder()->find_index(obj); 368 RelocationHolder rspec = metadata_Relocation::spec(index); 369 return AddressLiteral((address)obj, rspec); 370 } 371 372 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) { 373 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 374 int oop_index = oop_recorder()->allocate_oop_index(obj); 375 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 376 } 377 378 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) { 379 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 380 int oop_index = oop_recorder()->find_index(obj); 381 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 382 } 383 384 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr, 385 Register tmp, int offset) { 386 intptr_t value = *delayed_value_addr; 387 if (value != 0) { 388 return RegisterOrConstant(value + offset); 389 } 390 391 // Load indirectly to solve generation ordering problem. 392 // static address, no relocation 393 int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true); 394 ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0) 395 396 if (offset != 0) { 397 addi(tmp, tmp, offset); 398 } 399 400 return RegisterOrConstant(tmp); 401 } 402 403 #ifndef PRODUCT 404 void MacroAssembler::pd_print_patched_instruction(address branch) { 405 Unimplemented(); // TODO: PPC port 406 } 407 #endif // ndef PRODUCT 408 409 // Conditional far branch for destinations encodable in 24+2 bits. 410 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) { 411 412 // If requested by flag optimize, relocate the bc_far as a 413 // runtime_call and prepare for optimizing it when the code gets 414 // relocated. 415 if (optimize == bc_far_optimize_on_relocate) { 416 relocate(relocInfo::runtime_call_type); 417 } 418 419 // variant 2: 420 // 421 // b!cxx SKIP 422 // bxx DEST 423 // SKIP: 424 // 425 426 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 427 opposite_bcond(inv_boint_bcond(boint))); 428 429 // We emit two branches. 430 // First, a conditional branch which jumps around the far branch. 
431 const address not_taken_pc = pc() + 2 * BytesPerInstWord; 432 const address bc_pc = pc(); 433 bc(opposite_boint, biint, not_taken_pc); 434 435 const int bc_instr = *(int*)bc_pc; 436 assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition"); 437 assert(opposite_boint == inv_bo_field(bc_instr), "postcondition"); 438 assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))), 439 opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))), 440 "postcondition"); 441 assert(biint == inv_bi_field(bc_instr), "postcondition"); 442 443 // Second, an unconditional far branch which jumps to dest. 444 // Note: target(dest) remembers the current pc (see CodeSection::target) 445 // and returns the current pc if the label is not bound yet; when 446 // the label gets bound, the unconditional far branch will be patched. 447 const address target_pc = target(dest); 448 const address b_pc = pc(); 449 b(target_pc); 450 451 assert(not_taken_pc == pc(), "postcondition"); 452 assert(dest.is_bound() || target_pc == b_pc, "postcondition"); 453 } 454 455 // 1 or 2 instructions 456 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) { 457 if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) { 458 bc(boint, biint, dest); 459 } else { 460 bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate); 461 } 462 } 463 464 bool MacroAssembler::is_bc_far_at(address instruction_addr) { 465 return is_bc_far_variant1_at(instruction_addr) || 466 is_bc_far_variant2_at(instruction_addr) || 467 is_bc_far_variant3_at(instruction_addr); 468 } 469 470 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) { 471 if (is_bc_far_variant1_at(instruction_addr)) { 472 const address instruction_1_addr = instruction_addr; 473 const int instruction_1 = *(int*)instruction_1_addr; 474 return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr); 475 } else if (is_bc_far_variant2_at(instruction_addr)) { 476 const address instruction_2_addr = instruction_addr + 4; 477 return bxx_destination(instruction_2_addr); 478 } else if (is_bc_far_variant3_at(instruction_addr)) { 479 return instruction_addr + 8; 480 } 481 // variant 4 ??? 482 ShouldNotReachHere(); 483 return NULL; 484 } 485 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) { 486 487 if (is_bc_far_variant3_at(instruction_addr)) { 488 // variant 3, far cond branch to the next instruction, already patched to nops: 489 // 490 // nop 491 // endgroup 492 // SKIP/DEST: 493 // 494 return; 495 } 496 497 // first, extract boint and biint from the current branch 498 int boint = 0; 499 int biint = 0; 500 501 ResourceMark rm; 502 const int code_size = 2 * BytesPerInstWord; 503 CodeBuffer buf(instruction_addr, code_size); 504 MacroAssembler masm(&buf); 505 if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) { 506 // Far branch to next instruction: Optimize it by patching nops (produce variant 3). 
507 masm.nop(); 508 masm.endgroup(); 509 } else { 510 if (is_bc_far_variant1_at(instruction_addr)) { 511 // variant 1, the 1st instruction contains the destination address: 512 // 513 // bcxx DEST 514 // nop 515 // 516 const int instruction_1 = *(int*)(instruction_addr); 517 boint = inv_bo_field(instruction_1); 518 biint = inv_bi_field(instruction_1); 519 } else if (is_bc_far_variant2_at(instruction_addr)) { 520 // variant 2, the 2nd instruction contains the destination address: 521 // 522 // b!cxx SKIP 523 // bxx DEST 524 // SKIP: 525 // 526 const int instruction_1 = *(int*)(instruction_addr); 527 boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))), 528 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1)))); 529 biint = inv_bi_field(instruction_1); 530 } else { 531 // variant 4??? 532 ShouldNotReachHere(); 533 } 534 535 // second, set the new branch destination and optimize the code 536 if (dest != instruction_addr + 4 && // the bc_far is still unbound! 537 masm.is_within_range_of_bcxx(dest, instruction_addr)) { 538 // variant 1: 539 // 540 // bcxx DEST 541 // nop 542 // 543 masm.bc(boint, biint, dest); 544 masm.nop(); 545 } else { 546 // variant 2: 547 // 548 // b!cxx SKIP 549 // bxx DEST 550 // SKIP: 551 // 552 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 553 opposite_bcond(inv_boint_bcond(boint))); 554 const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord; 555 masm.bc(opposite_boint, biint, not_taken_pc); 556 masm.b(dest); 557 } 558 } 559 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 560 } 561 562 // Emit a NOT mt-safe patchable 64 bit absolute call/jump. 563 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) { 564 // get current pc 565 uint64_t start_pc = (uint64_t) pc(); 566 567 const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last 568 const address pc_of_b = (address) (start_pc + (0*BytesPerInstWord)); // b is first 569 570 // relocate here 571 if (rt != relocInfo::none) { 572 relocate(rt); 573 } 574 575 if ( ReoptimizeCallSequences && 576 (( link && is_within_range_of_b(dest, pc_of_bl)) || 577 (!link && is_within_range_of_b(dest, pc_of_b)))) { 578 // variant 2: 579 // Emit an optimized, pc-relative call/jump. 580 581 if (link) { 582 // some padding 583 nop(); 584 nop(); 585 nop(); 586 nop(); 587 nop(); 588 nop(); 589 590 // do the call 591 assert(pc() == pc_of_bl, "just checking"); 592 bl(dest, relocInfo::none); 593 } else { 594 // do the jump 595 assert(pc() == pc_of_b, "just checking"); 596 b(dest, relocInfo::none); 597 598 // some padding 599 nop(); 600 nop(); 601 nop(); 602 nop(); 603 nop(); 604 nop(); 605 } 606 607 // Assert that we can identify the emitted call/jump. 608 assert(is_bxx64_patchable_variant2_at((address)start_pc, link), 609 "can't identify emitted call"); 610 } else { 611 // variant 1: 612 mr(R0, R11); // spill R11 -> R0. 613 614 // Load the destination address into CTR, 615 // calculate destination relative to global toc. 616 calculate_address_from_global_toc(R11, dest, true, true, false); 617 618 mtctr(R11); 619 mr(R11, R0); // spill R11 <- R0. 620 nop(); 621 622 // do the call/jump 623 if (link) { 624 bctrl(); 625 } else{ 626 bctr(); 627 } 628 // Assert that we can identify the emitted call/jump. 629 assert(is_bxx64_patchable_variant1b_at((address)start_pc, link), 630 "can't identify emitted call"); 631 } 632 633 // Assert that we can identify the emitted call/jump. 
634 assert(is_bxx64_patchable_at((address)start_pc, link), 635 "can't identify emitted call"); 636 assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest, 637 "wrong encoding of dest address"); 638 } 639 640 // Identify a bxx64_patchable instruction. 641 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) { 642 return is_bxx64_patchable_variant1b_at(instruction_addr, link) 643 //|| is_bxx64_patchable_variant1_at(instruction_addr, link) 644 || is_bxx64_patchable_variant2_at(instruction_addr, link); 645 } 646 647 // Does the call64_patchable instruction use a pc-relative encoding of 648 // the call destination? 649 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) { 650 // variant 2 is pc-relative 651 return is_bxx64_patchable_variant2_at(instruction_addr, link); 652 } 653 654 // Identify variant 1. 655 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) { 656 unsigned int* instr = (unsigned int*) instruction_addr; 657 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 658 && is_mtctr(instr[5]) // mtctr 659 && is_load_const_at(instruction_addr); 660 } 661 662 // Identify variant 1b: load destination relative to global toc. 663 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) { 664 unsigned int* instr = (unsigned int*) instruction_addr; 665 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 666 && is_mtctr(instr[3]) // mtctr 667 && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr); 668 } 669 670 // Identify variant 2. 671 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) { 672 unsigned int* instr = (unsigned int*) instruction_addr; 673 if (link) { 674 return is_bl (instr[6]) // bl dest is last 675 && is_nop(instr[0]) // nop 676 && is_nop(instr[1]) // nop 677 && is_nop(instr[2]) // nop 678 && is_nop(instr[3]) // nop 679 && is_nop(instr[4]) // nop 680 && is_nop(instr[5]); // nop 681 } else { 682 return is_b (instr[0]) // b dest is first 683 && is_nop(instr[1]) // nop 684 && is_nop(instr[2]) // nop 685 && is_nop(instr[3]) // nop 686 && is_nop(instr[4]) // nop 687 && is_nop(instr[5]) // nop 688 && is_nop(instr[6]); // nop 689 } 690 } 691 692 // Set dest address of a bxx64_patchable instruction. 693 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) { 694 ResourceMark rm; 695 int code_size = MacroAssembler::bxx64_patchable_size; 696 CodeBuffer buf(instruction_addr, code_size); 697 MacroAssembler masm(&buf); 698 masm.bxx64_patchable(dest, relocInfo::none, link); 699 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 700 } 701 702 // Get dest address of a bxx64_patchable instruction. 
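// Depending on the variant found at instruction_addr, this either decodes
// the 64-bit constant (variant 1), follows the pc-relative b/bl (variant 2),
// or recomputes the address from the global TOC (variant 1b).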
703 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) { 704 if (is_bxx64_patchable_variant1_at(instruction_addr, link)) { 705 return (address) (unsigned long) get_const(instruction_addr); 706 } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) { 707 unsigned int* instr = (unsigned int*) instruction_addr; 708 if (link) { 709 const int instr_idx = 6; // bl is last 710 int branchoffset = branch_destination(instr[instr_idx], 0); 711 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 712 } else { 713 const int instr_idx = 0; // b is first 714 int branchoffset = branch_destination(instr[instr_idx], 0); 715 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 716 } 717 // Load dest relative to global toc. 718 } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) { 719 return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, 720 instruction_addr); 721 } else { 722 ShouldNotReachHere(); 723 return NULL; 724 } 725 } 726 727 // Uses ordering which corresponds to ABI: 728 // _savegpr0_14: std r14,-144(r1) 729 // _savegpr0_15: std r15,-136(r1) 730 // _savegpr0_16: std r16,-128(r1) 731 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) { 732 std(R14, offset, dst); offset += 8; 733 std(R15, offset, dst); offset += 8; 734 std(R16, offset, dst); offset += 8; 735 std(R17, offset, dst); offset += 8; 736 std(R18, offset, dst); offset += 8; 737 std(R19, offset, dst); offset += 8; 738 std(R20, offset, dst); offset += 8; 739 std(R21, offset, dst); offset += 8; 740 std(R22, offset, dst); offset += 8; 741 std(R23, offset, dst); offset += 8; 742 std(R24, offset, dst); offset += 8; 743 std(R25, offset, dst); offset += 8; 744 std(R26, offset, dst); offset += 8; 745 std(R27, offset, dst); offset += 8; 746 std(R28, offset, dst); offset += 8; 747 std(R29, offset, dst); offset += 8; 748 std(R30, offset, dst); offset += 8; 749 std(R31, offset, dst); offset += 8; 750 751 stfd(F14, offset, dst); offset += 8; 752 stfd(F15, offset, dst); offset += 8; 753 stfd(F16, offset, dst); offset += 8; 754 stfd(F17, offset, dst); offset += 8; 755 stfd(F18, offset, dst); offset += 8; 756 stfd(F19, offset, dst); offset += 8; 757 stfd(F20, offset, dst); offset += 8; 758 stfd(F21, offset, dst); offset += 8; 759 stfd(F22, offset, dst); offset += 8; 760 stfd(F23, offset, dst); offset += 8; 761 stfd(F24, offset, dst); offset += 8; 762 stfd(F25, offset, dst); offset += 8; 763 stfd(F26, offset, dst); offset += 8; 764 stfd(F27, offset, dst); offset += 8; 765 stfd(F28, offset, dst); offset += 8; 766 stfd(F29, offset, dst); offset += 8; 767 stfd(F30, offset, dst); offset += 8; 768 stfd(F31, offset, dst); 769 } 770 771 // Uses ordering which corresponds to ABI: 772 // _restgpr0_14: ld r14,-144(r1) 773 // _restgpr0_15: ld r15,-136(r1) 774 // _restgpr0_16: ld r16,-128(r1) 775 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) { 776 ld(R14, offset, src); offset += 8; 777 ld(R15, offset, src); offset += 8; 778 ld(R16, offset, src); offset += 8; 779 ld(R17, offset, src); offset += 8; 780 ld(R18, offset, src); offset += 8; 781 ld(R19, offset, src); offset += 8; 782 ld(R20, offset, src); offset += 8; 783 ld(R21, offset, src); offset += 8; 784 ld(R22, offset, src); offset += 8; 785 ld(R23, offset, src); offset += 8; 786 ld(R24, offset, src); offset += 8; 787 ld(R25, offset, src); offset += 8; 788 ld(R26, offset, src); offset += 8; 789 ld(R27, offset, src); offset += 8; 790 
ld(R28, offset, src); offset += 8; 791 ld(R29, offset, src); offset += 8; 792 ld(R30, offset, src); offset += 8; 793 ld(R31, offset, src); offset += 8; 794 795 // FP registers 796 lfd(F14, offset, src); offset += 8; 797 lfd(F15, offset, src); offset += 8; 798 lfd(F16, offset, src); offset += 8; 799 lfd(F17, offset, src); offset += 8; 800 lfd(F18, offset, src); offset += 8; 801 lfd(F19, offset, src); offset += 8; 802 lfd(F20, offset, src); offset += 8; 803 lfd(F21, offset, src); offset += 8; 804 lfd(F22, offset, src); offset += 8; 805 lfd(F23, offset, src); offset += 8; 806 lfd(F24, offset, src); offset += 8; 807 lfd(F25, offset, src); offset += 8; 808 lfd(F26, offset, src); offset += 8; 809 lfd(F27, offset, src); offset += 8; 810 lfd(F28, offset, src); offset += 8; 811 lfd(F29, offset, src); offset += 8; 812 lfd(F30, offset, src); offset += 8; 813 lfd(F31, offset, src); 814 } 815 816 // For verify_oops. 817 void MacroAssembler::save_volatile_gprs(Register dst, int offset) { 818 std(R2, offset, dst); offset += 8; 819 std(R3, offset, dst); offset += 8; 820 std(R4, offset, dst); offset += 8; 821 std(R5, offset, dst); offset += 8; 822 std(R6, offset, dst); offset += 8; 823 std(R7, offset, dst); offset += 8; 824 std(R8, offset, dst); offset += 8; 825 std(R9, offset, dst); offset += 8; 826 std(R10, offset, dst); offset += 8; 827 std(R11, offset, dst); offset += 8; 828 std(R12, offset, dst); offset += 8; 829 830 stfd(F0, offset, dst); offset += 8; 831 stfd(F1, offset, dst); offset += 8; 832 stfd(F2, offset, dst); offset += 8; 833 stfd(F3, offset, dst); offset += 8; 834 stfd(F4, offset, dst); offset += 8; 835 stfd(F5, offset, dst); offset += 8; 836 stfd(F6, offset, dst); offset += 8; 837 stfd(F7, offset, dst); offset += 8; 838 stfd(F8, offset, dst); offset += 8; 839 stfd(F9, offset, dst); offset += 8; 840 stfd(F10, offset, dst); offset += 8; 841 stfd(F11, offset, dst); offset += 8; 842 stfd(F12, offset, dst); offset += 8; 843 stfd(F13, offset, dst); 844 } 845 846 // For verify_oops. 847 void MacroAssembler::restore_volatile_gprs(Register src, int offset) { 848 ld(R2, offset, src); offset += 8; 849 ld(R3, offset, src); offset += 8; 850 ld(R4, offset, src); offset += 8; 851 ld(R5, offset, src); offset += 8; 852 ld(R6, offset, src); offset += 8; 853 ld(R7, offset, src); offset += 8; 854 ld(R8, offset, src); offset += 8; 855 ld(R9, offset, src); offset += 8; 856 ld(R10, offset, src); offset += 8; 857 ld(R11, offset, src); offset += 8; 858 ld(R12, offset, src); offset += 8; 859 860 lfd(F0, offset, src); offset += 8; 861 lfd(F1, offset, src); offset += 8; 862 lfd(F2, offset, src); offset += 8; 863 lfd(F3, offset, src); offset += 8; 864 lfd(F4, offset, src); offset += 8; 865 lfd(F5, offset, src); offset += 8; 866 lfd(F6, offset, src); offset += 8; 867 lfd(F7, offset, src); offset += 8; 868 lfd(F8, offset, src); offset += 8; 869 lfd(F9, offset, src); offset += 8; 870 lfd(F10, offset, src); offset += 8; 871 lfd(F11, offset, src); offset += 8; 872 lfd(F12, offset, src); offset += 8; 873 lfd(F13, offset, src); 874 } 875 876 void MacroAssembler::save_LR_CR(Register tmp) { 877 mfcr(tmp); 878 std(tmp, _abi(cr), R1_SP); 879 mflr(tmp); 880 std(tmp, _abi(lr), R1_SP); 881 // Tmp must contain lr on exit! 
  // (see return_addr and prolog in ppc64.ad)
}

void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi(cr), R1_SP);
  mtcr(tmp);
}

address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned", 0x204);
#endif

  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}

void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
987 if (and_link) { 988 bctrl(); 989 } else { 990 bctr(); 991 } 992 _last_calls_return_pc = pc(); 993 994 return _last_calls_return_pc; 995 } 996 997 // Call a C function via a function descriptor and use full C 998 // calling conventions. Updates and returns _last_calls_return_pc. 999 address MacroAssembler::call_c(Register r_function_entry) { 1000 return branch_to(r_function_entry, /*and_link=*/true); 1001 } 1002 1003 // For tail calls: only branch, don't link, so callee returns to caller of this function. 1004 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) { 1005 return branch_to(r_function_entry, /*and_link=*/false); 1006 } 1007 1008 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) { 1009 load_const(R12, function_entry, R0); 1010 return branch_to(R12, /*and_link=*/true); 1011 } 1012 1013 #else 1014 // Generic version of a call to C function via a function descriptor 1015 // with variable support for C calling conventions (TOC, ENV, etc.). 1016 // Updates and returns _last_calls_return_pc. 1017 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call, 1018 bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) { 1019 // we emit standard ptrgl glue code here 1020 assert((function_descriptor != R0), "function_descriptor cannot be R0"); 1021 1022 // retrieve necessary entries from the function descriptor 1023 ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor); 1024 mtctr(R0); 1025 1026 if (load_toc_of_callee) { 1027 ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor); 1028 } 1029 if (load_env_of_callee) { 1030 ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor); 1031 } else if (load_toc_of_callee) { 1032 li(R11, 0); 1033 } 1034 1035 // do a call or a branch 1036 if (and_link) { 1037 bctrl(); 1038 } else { 1039 bctr(); 1040 } 1041 _last_calls_return_pc = pc(); 1042 1043 return _last_calls_return_pc; 1044 } 1045 1046 // Call a C function via a function descriptor and use full C calling 1047 // conventions. 1048 // We don't use the TOC in generated code, so there is no need to save 1049 // and restore its value. 1050 address MacroAssembler::call_c(Register fd) { 1051 return branch_to(fd, /*and_link=*/true, 1052 /*save toc=*/false, 1053 /*restore toc=*/false, 1054 /*load toc=*/true, 1055 /*load env=*/true); 1056 } 1057 1058 address MacroAssembler::call_c_and_return_to_caller(Register fd) { 1059 return branch_to(fd, /*and_link=*/false, 1060 /*save toc=*/false, 1061 /*restore toc=*/false, 1062 /*load toc=*/true, 1063 /*load env=*/true); 1064 } 1065 1066 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) { 1067 if (rt != relocInfo::none) { 1068 // this call needs to be relocatable 1069 if (!ReoptimizeCallSequences 1070 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1071 || fd == NULL // support code-size estimation 1072 || !fd->is_friend_function() 1073 || fd->entry() == NULL) { 1074 // it's not a friend function as defined by class FunctionDescriptor, 1075 // so do a full call-c here. 1076 load_const(R11, (address)fd, R0); 1077 1078 bool has_env = (fd != NULL && fd->env() != NULL); 1079 return branch_to(R11, /*and_link=*/true, 1080 /*save toc=*/false, 1081 /*restore toc=*/false, 1082 /*load toc=*/true, 1083 /*load env=*/has_env); 1084 } else { 1085 // It's a friend function. Load the entry point and don't care about 1086 // toc and env. 
Use an optimizable call instruction, but ensure the 1087 // same code-size as in the case of a non-friend function. 1088 nop(); 1089 nop(); 1090 nop(); 1091 bl64_patchable(fd->entry(), rt); 1092 _last_calls_return_pc = pc(); 1093 return _last_calls_return_pc; 1094 } 1095 } else { 1096 // This call does not need to be relocatable, do more aggressive 1097 // optimizations. 1098 if (!ReoptimizeCallSequences 1099 || !fd->is_friend_function()) { 1100 // It's not a friend function as defined by class FunctionDescriptor, 1101 // so do a full call-c here. 1102 load_const(R11, (address)fd, R0); 1103 return branch_to(R11, /*and_link=*/true, 1104 /*save toc=*/false, 1105 /*restore toc=*/false, 1106 /*load toc=*/true, 1107 /*load env=*/true); 1108 } else { 1109 // it's a friend function, load the entry point and don't care about 1110 // toc and env. 1111 address dest = fd->entry(); 1112 if (is_within_range_of_b(dest, pc())) { 1113 bl(dest); 1114 } else { 1115 bl64_patchable(dest, rt); 1116 } 1117 _last_calls_return_pc = pc(); 1118 return _last_calls_return_pc; 1119 } 1120 } 1121 } 1122 1123 // Call a C function. All constants needed reside in TOC. 1124 // 1125 // Read the address to call from the TOC. 1126 // Read env from TOC, if fd specifies an env. 1127 // Read new TOC from TOC. 1128 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd, 1129 relocInfo::relocType rt, Register toc) { 1130 if (!ReoptimizeCallSequences 1131 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1132 || !fd->is_friend_function()) { 1133 // It's not a friend function as defined by class FunctionDescriptor, 1134 // so do a full call-c here. 1135 assert(fd->entry() != NULL, "function must be linked"); 1136 1137 AddressLiteral fd_entry(fd->entry()); 1138 bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true); 1139 mtctr(R11); 1140 if (fd->env() == NULL) { 1141 li(R11, 0); 1142 nop(); 1143 } else { 1144 AddressLiteral fd_env(fd->env()); 1145 success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true); 1146 } 1147 AddressLiteral fd_toc(fd->toc()); 1148 // Set R2_TOC (load from toc) 1149 success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true); 1150 bctrl(); 1151 _last_calls_return_pc = pc(); 1152 if (!success) { return NULL; } 1153 } else { 1154 // It's a friend function, load the entry point and don't care about 1155 // toc and env. Use an optimizable call instruction, but ensure the 1156 // same code-size as in the case of a non-friend function. 1157 nop(); 1158 bl64_patchable(fd->entry(), rt); 1159 _last_calls_return_pc = pc(); 1160 } 1161 return _last_calls_return_pc; 1162 } 1163 #endif // ABI_ELFv2 1164 1165 void MacroAssembler::call_VM_base(Register oop_result, 1166 Register last_java_sp, 1167 address entry_point, 1168 bool check_exceptions) { 1169 BLOCK_COMMENT("call_VM {"); 1170 // Determine last_java_sp register. 1171 if (!last_java_sp->is_valid()) { 1172 last_java_sp = R1_SP; 1173 } 1174 set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1); 1175 1176 // ARG1 must hold thread address. 1177 mr(R3_ARG1, R16_thread); 1178 #if defined(ABI_ELFv2) 1179 address return_pc = call_c(entry_point, relocInfo::none); 1180 #else 1181 address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none); 1182 #endif 1183 1184 reset_last_Java_frame(); 1185 1186 // Check for pending exceptions. 1187 if (check_exceptions) { 1188 // We don't check for exceptions here. 
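    // (Requesting an exception check is not implemented on this path and
    //  stops code generation with a fatal error.)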
1189 ShouldNotReachHere(); 1190 } 1191 1192 // Get oop result if there is one and reset the value in the thread. 1193 if (oop_result->is_valid()) { 1194 get_vm_result(oop_result); 1195 } 1196 1197 _last_calls_return_pc = return_pc; 1198 BLOCK_COMMENT("} call_VM"); 1199 } 1200 1201 void MacroAssembler::call_VM_leaf_base(address entry_point) { 1202 BLOCK_COMMENT("call_VM_leaf {"); 1203 #if defined(ABI_ELFv2) 1204 call_c(entry_point, relocInfo::none); 1205 #else 1206 call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none); 1207 #endif 1208 BLOCK_COMMENT("} call_VM_leaf"); 1209 } 1210 1211 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) { 1212 call_VM_base(oop_result, noreg, entry_point, check_exceptions); 1213 } 1214 1215 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, 1216 bool check_exceptions) { 1217 // R3_ARG1 is reserved for the thread. 1218 mr_if_needed(R4_ARG2, arg_1); 1219 call_VM(oop_result, entry_point, check_exceptions); 1220 } 1221 1222 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, 1223 bool check_exceptions) { 1224 // R3_ARG1 is reserved for the thread 1225 mr_if_needed(R4_ARG2, arg_1); 1226 assert(arg_2 != R4_ARG2, "smashed argument"); 1227 mr_if_needed(R5_ARG3, arg_2); 1228 call_VM(oop_result, entry_point, check_exceptions); 1229 } 1230 1231 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3, 1232 bool check_exceptions) { 1233 // R3_ARG1 is reserved for the thread 1234 mr_if_needed(R4_ARG2, arg_1); 1235 assert(arg_2 != R4_ARG2, "smashed argument"); 1236 mr_if_needed(R5_ARG3, arg_2); 1237 mr_if_needed(R6_ARG4, arg_3); 1238 call_VM(oop_result, entry_point, check_exceptions); 1239 } 1240 1241 void MacroAssembler::call_VM_leaf(address entry_point) { 1242 call_VM_leaf_base(entry_point); 1243 } 1244 1245 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) { 1246 mr_if_needed(R3_ARG1, arg_1); 1247 call_VM_leaf(entry_point); 1248 } 1249 1250 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) { 1251 mr_if_needed(R3_ARG1, arg_1); 1252 assert(arg_2 != R3_ARG1, "smashed argument"); 1253 mr_if_needed(R4_ARG2, arg_2); 1254 call_VM_leaf(entry_point); 1255 } 1256 1257 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) { 1258 mr_if_needed(R3_ARG1, arg_1); 1259 assert(arg_2 != R3_ARG1, "smashed argument"); 1260 mr_if_needed(R4_ARG2, arg_2); 1261 assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument"); 1262 mr_if_needed(R5_ARG3, arg_3); 1263 call_VM_leaf(entry_point); 1264 } 1265 1266 // Check whether instruction is a read access to the polling page 1267 // which was emitted by load_from_polling_page(..). 1268 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext, 1269 address* polling_address_ptr) { 1270 if (!is_ld(instruction)) 1271 return false; // It's not a ld. Fail. 1272 1273 int rt = inv_rt_field(instruction); 1274 int ra = inv_ra_field(instruction); 1275 int ds = inv_ds_field(instruction); 1276 if (!(ds == 0 && ra != 0 && rt == 0)) { 1277 return false; // It's not a ld(r0, X, ra). Fail. 1278 } 1279 1280 if (!ucontext) { 1281 // Set polling address. 1282 if (polling_address_ptr != NULL) { 1283 *polling_address_ptr = NULL; 1284 } 1285 return true; // No ucontext given. Can't check value of ra. Assume true. 
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != NULL) {
    *polling_address_ptr = addr;
  }
  return os::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be NULL.
  ShouldNotReachHere();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 abi.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0,(int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}

// If instruction is a stack bang of the form
//   std   R0,    x(Ry),      (see bang_stack_with_offset())
//   stdu  R1_SP, x(R1_SP),   (see push_frame(), resize_frame())
// or stdux R1_SP, Rx, R1_SP  (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
                                  : sp + rb_val; // banged address
  }
  return NULL; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return NULL;
#endif
}

void MacroAssembler::reserved_stack_check(Register return_pc) {
  // Test if reserved zone needs to be enabled.
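  // If enabling is required, the code below calls
  // SharedRuntime::enable_stack_reserved_zone and then jumps to the delayed
  // StackOverflowError stub via CTR, with return_pc restored into LR.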
  Label no_reserved_zone_enabling;

  ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
  cmpld(CCR0, R1_SP, R0);
  blt_predict_taken(CCR0, no_reserved_zone_enabling);

  // Enable reserved zone again, throw stack overflow exception.
  push_frame_reg_args(0, R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
  pop_frame();
  mtlr(return_pc);
  load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
  mtctr(R0);
  bctr();

  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
                                bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  stdcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne( CCR0, retry); // StXcx_ sets CCR0.
  }
}

void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
                                Register tmp, bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  add(tmp, dest_current_value, inc_value);
  stdcx_(tmp, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne( CCR0, retry); // StXcx_ sets CCR0.
  }
}

// Word/sub-word atomic helper functions

// Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
// Atomic add always kills tmp1.
void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
                                                   Register addr_base, Register tmp1, Register tmp2, Register tmp3,
                                                   bool cmpxchgx_hint, bool is_add, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Label retry;
  Register shift_amount = noreg,
           val32 = dest_current_value,
           modval = is_add ? tmp1 : exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
    modval = tmp1;
    shift_amount = tmp2;
    val32 = tmp3;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    srw(dest_current_value, val32, shift_amount);
  }

  if (is_add) { add(modval, dest_current_value, exchange_value); }

  if (instruction_type != size) {
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(modval, dest_current_value, is_add ? modval : exchange_value);
    clrldi(modval, modval, (size == 1) ? 56 : 48);
    slw(modval, modval, shift_amount);
    xorr(modval, val32, modval);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }

  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne( CCR0, retry); // StXcx_ sets CCR0.
  }

  // l?arx zero-extends, but Java wants byte/short values sign-extended.
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  };
}

// Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
                                       Register compare_value, Register exchange_value,
                                       Register addr_base, Register tmp1, Register tmp2,
                                       Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Register shift_amount = noreg,
           val32 = dest_current_value,
           modval = exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
    shift_amount = tmp1;
    val32 = tmp2;
    modval = tmp2;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(exchange_value, compare_value, exchange_value);
    clrldi(exchange_value, exchange_value, (size == 1) ?
56 : 48); 1534 slw(exchange_value, exchange_value, shift_amount); 1535 } 1536 1537 // atomic emulation loop 1538 bind(retry); 1539 1540 switch (instruction_type) { 1541 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1542 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1543 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1544 default: ShouldNotReachHere(); 1545 } 1546 1547 if (instruction_type != size) { 1548 srw(dest_current_value, val32, shift_amount); 1549 } 1550 if (size == 1) { 1551 extsb(dest_current_value, dest_current_value); 1552 } else if (size == 2) { 1553 extsh(dest_current_value, dest_current_value); 1554 }; 1555 1556 cmpw(flag, dest_current_value, compare_value); 1557 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1558 bne_predict_not_taken(flag, failed); 1559 } else { 1560 bne( flag, failed); 1561 } 1562 // branch to done => (flag == ne), (dest_current_value != compare_value) 1563 // fall through => (flag == eq), (dest_current_value == compare_value) 1564 1565 if (instruction_type != size) { 1566 xorr(modval, val32, exchange_value); 1567 } 1568 1569 switch (instruction_type) { 1570 case 4: stwcx_(modval, addr_base); break; 1571 case 2: sthcx_(modval, addr_base); break; 1572 case 1: stbcx_(modval, addr_base); break; 1573 default: ShouldNotReachHere(); 1574 } 1575 } 1576 1577 // CmpxchgX sets condition register to cmpX(current, compare). 1578 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value, 1579 Register compare_value, Register exchange_value, 1580 Register addr_base, Register tmp1, Register tmp2, 1581 int semantics, bool cmpxchgx_hint, 1582 Register int_flag_success, bool contention_hint, bool weak, int size) { 1583 Label retry; 1584 Label failed; 1585 Label done; 1586 1587 // Save one branch if result is returned via register and 1588 // result register is different from the other ones. 1589 bool use_result_reg = (int_flag_success != noreg); 1590 bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value && 1591 int_flag_success != exchange_value && int_flag_success != addr_base && 1592 int_flag_success != tmp1 && int_flag_success != tmp2); 1593 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1594 assert(size == 1 || size == 2 || size == 4, "unsupported"); 1595 1596 if (use_result_reg && preset_result_reg) { 1597 li(int_flag_success, 0); // preset (assume cas failed) 1598 } 1599 1600 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1601 if (contention_hint) { // Don't try to reserve if cmp fails. 1602 switch (size) { 1603 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break; 1604 case 2: lha(dest_current_value, 0, addr_base); break; 1605 case 4: lwz(dest_current_value, 0, addr_base); break; 1606 default: ShouldNotReachHere(); 1607 } 1608 cmpw(flag, dest_current_value, compare_value); 1609 bne(flag, failed); 1610 } 1611 1612 // release/fence semantics 1613 if (semantics & MemBarRel) { 1614 release(); 1615 } 1616 1617 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2, 1618 retry, failed, cmpxchgx_hint, size); 1619 if (!weak || use_result_reg) { 1620 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1621 bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1622 } else { 1623 bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 
1624 } 1625 } 1626 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped) 1627 1628 // Result in register (must do this at the end because int_flag_success can be the 1629 // same register as one above). 1630 if (use_result_reg) { 1631 li(int_flag_success, 1); 1632 } 1633 1634 if (semantics & MemBarFenceAfter) { 1635 fence(); 1636 } else if (semantics & MemBarAcq) { 1637 isync(); 1638 } 1639 1640 if (use_result_reg && !preset_result_reg) { 1641 b(done); 1642 } 1643 1644 bind(failed); 1645 if (use_result_reg && !preset_result_reg) { 1646 li(int_flag_success, 0); 1647 } 1648 1649 bind(done); 1650 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1651 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1652 } 1653 1654 // Performs atomic compare exchange: 1655 // if (compare_value == *addr_base) 1656 // *addr_base = exchange_value 1657 // int_flag_success = 1; 1658 // else 1659 // int_flag_success = 0; 1660 // 1661 // ConditionRegister flag = cmp(compare_value, *addr_base) 1662 // Register dest_current_value = *addr_base 1663 // Register compare_value Used to compare with value in memory 1664 // Register exchange_value Written to memory if compare_value == *addr_base 1665 // Register addr_base The memory location to compareXChange 1666 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base 1667 // 1668 // To avoid the costly compare-exchange, the value is tested beforehand. 1669 // Several special cases exist to avoid generating unnecessary code. 1670 // 1671 void MacroAssembler::cmpxchgd(ConditionRegister flag, 1672 Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value, 1673 Register addr_base, int semantics, bool cmpxchgx_hint, 1674 Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) { 1675 Label retry; 1676 Label failed_int; 1677 Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int; 1678 Label done; 1679 1680 // Save one branch if result is returned via register and result register is different from the other ones. 1681 bool use_result_reg = (int_flag_success!=noreg); 1682 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1683 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1684 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1685 assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both"); 1686 1687 if (use_result_reg && preset_result_reg) { 1688 li(int_flag_success, 0); // preset (assume cas failed) 1689 } 1690 1691 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1692 if (contention_hint) { // Don't try to reserve if cmp fails. 1693 ld(dest_current_value, 0, addr_base); 1694 cmpd(flag, compare_value, dest_current_value); 1695 bne(flag, failed); 1696 } 1697 1698 // release/fence semantics 1699 if (semantics & MemBarRel) { 1700 release(); 1701 } 1702 1703 // atomic emulation loop 1704 bind(retry); 1705 1706 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1707 cmpd(flag, compare_value, dest_current_value); 1708 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1709 bne_predict_not_taken(flag, failed); 1710 } else { 1711 bne( flag, failed); 1712 } 1713 1714 stdcx_(exchange_value, addr_base); 1715 if (!weak || use_result_reg || failed_ext) { 1716 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1717 bne_predict_not_taken(CCR0, weak ?
failed : retry); // stXcx_ sets CCR0 1718 } else { 1719 bne( CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1720 } 1721 } 1722 1723 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1724 if (use_result_reg) { 1725 li(int_flag_success, 1); 1726 } 1727 1728 if (semantics & MemBarFenceAfter) { 1729 fence(); 1730 } else if (semantics & MemBarAcq) { 1731 isync(); 1732 } 1733 1734 if (use_result_reg && !preset_result_reg) { 1735 b(done); 1736 } 1737 1738 bind(failed_int); 1739 if (use_result_reg && !preset_result_reg) { 1740 li(int_flag_success, 0); 1741 } 1742 1743 bind(done); 1744 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1745 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1746 } 1747 1748 // Look up the method for a megamorphic invokeinterface call. 1749 // The target method is determined by <intf_klass, itable_index>. 1750 // The receiver klass is in recv_klass. 1751 // On success, the result will be in method_result, and execution falls through. 1752 // On failure, execution transfers to the given label. 1753 void MacroAssembler::lookup_interface_method(Register recv_klass, 1754 Register intf_klass, 1755 RegisterOrConstant itable_index, 1756 Register method_result, 1757 Register scan_temp, 1758 Register temp2, 1759 Label& L_no_such_interface, 1760 bool return_method) { 1761 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1762 1763 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 1764 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1765 int itentry_off = itableMethodEntry::method_offset_in_bytes(); 1766 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1767 int scan_step = itableOffsetEntry::size() * wordSize; 1768 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1769 1770 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1771 // %%% We should store the aligned, prescaled offset in the klassoop. 1772 // Then the next several instructions would fold away. 1773 1774 sldi(scan_temp, scan_temp, log_vte_size); 1775 addi(scan_temp, scan_temp, vtable_base); 1776 add(scan_temp, recv_klass, scan_temp); 1777 1778 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1779 if (return_method) { 1780 if (itable_index.is_register()) { 1781 Register itable_offset = itable_index.as_register(); 1782 sldi(method_result, itable_offset, logMEsize); 1783 if (itentry_off) { addi(method_result, method_result, itentry_off); } 1784 add(method_result, method_result, recv_klass); 1785 } else { 1786 long itable_offset = (long)itable_index.as_constant(); 1787 // static address, no relocation 1788 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2); 1789 } 1790 } 1791 1792 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { 1793 // if (scan->interface() == intf) { 1794 // result = (klass + scan->offset() + itable_index); 1795 // } 1796 // } 1797 Label search, found_method; 1798 1799 for (int peel = 1; peel >= 0; peel--) { 1800 // %%%% Could load both offset and interface in one ldx, if they were 1801 // in the opposite order. This would save a load. 1802 ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp); 1803 1804 // Check that this entry is non-null. 
A null entry means that 1805 // the receiver class doesn't implement the interface, and wasn't the 1806 // same as when the caller was compiled. 1807 cmpd(CCR0, temp2, intf_klass); 1808 1809 if (peel) { 1810 beq(CCR0, found_method); 1811 } else { 1812 bne(CCR0, search); 1813 // (invert the test to fall through to found_method...) 1814 } 1815 1816 if (!peel) break; 1817 1818 bind(search); 1819 1820 cmpdi(CCR0, temp2, 0); 1821 beq(CCR0, L_no_such_interface); 1822 addi(scan_temp, scan_temp, scan_step); 1823 } 1824 1825 bind(found_method); 1826 1827 // Got a hit. 1828 if (return_method) { 1829 int ito_offset = itableOffsetEntry::offset_offset_in_bytes(); 1830 lwz(scan_temp, ito_offset, scan_temp); 1831 ldx(method_result, scan_temp, method_result); 1832 } 1833 } 1834 1835 // virtual method calling 1836 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1837 RegisterOrConstant vtable_index, 1838 Register method_result) { 1839 1840 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1841 1842 const int base = in_bytes(Klass::vtable_start_offset()); 1843 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1844 1845 if (vtable_index.is_register()) { 1846 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1847 add(recv_klass, vtable_index.as_register(), recv_klass); 1848 } else { 1849 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1850 } 1851 ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass); 1852 } 1853 1854 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1855 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1856 Register super_klass, 1857 Register temp1_reg, 1858 Register temp2_reg, 1859 Label* L_success, 1860 Label* L_failure, 1861 Label* L_slow_path, 1862 RegisterOrConstant super_check_offset) { 1863 1864 const Register check_cache_offset = temp1_reg; 1865 const Register cached_super = temp2_reg; 1866 1867 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1868 1869 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1870 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1871 1872 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1873 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1874 1875 Label L_fallthrough; 1876 int label_nulls = 0; 1877 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } 1878 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } 1879 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } 1880 assert(label_nulls <= 1 || 1881 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 1882 "at most one NULL in the batch, usually"); 1883 1884 // If the pointers are equal, we are done (e.g., String[] elements). 1885 // This self-check enables sharing of secondary supertype arrays among 1886 // non-primary types such as array-of-interface. Otherwise, each such 1887 // type would need its own customized SSA. 1888 // We move this check to the front of the fast path because many 1889 // type checks are in fact trivially successful in this manner, 1890 // so we get a nicely predicted branch right at the start of the check. 
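// Illustrative C-like sketch of the fast path emitted below (not the generated
// code; 'sco' is the super check offset in bytes, loaded from super_klass or
// passed in as a constant):
//   if (sub_klass == super_klass)                              -> L_success
//   if (*(Klass**)((address)sub_klass + sco) == super_klass)   -> L_success
//   else if (sco == secondary_super_cache_offset)              -> L_slow_path
//   else                                                       -> L_failure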
1891 cmpd(CCR0, sub_klass, super_klass); 1892 beq(CCR0, *L_success); 1893 1894 // Check the supertype display: 1895 if (must_load_sco) { 1896 // The super check offset is always positive... 1897 lwz(check_cache_offset, sco_offset, super_klass); 1898 super_check_offset = RegisterOrConstant(check_cache_offset); 1899 // super_check_offset is register. 1900 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 1901 } 1902 // The loaded value is the offset from KlassOopDesc. 1903 1904 ld(cached_super, super_check_offset, sub_klass); 1905 cmpd(CCR0, cached_super, super_klass); 1906 1907 // This check has worked decisively for primary supers. 1908 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1909 // (Secondary supers are interfaces and very deeply nested subtypes.) 1910 // This works in the same check above because of a tricky aliasing 1911 // between the super_cache and the primary super display elements. 1912 // (The 'super_check_addr' can address either, as the case requires.) 1913 // Note that the cache is updated below if it does not help us find 1914 // what we need immediately. 1915 // So if it was a primary super, we can just fail immediately. 1916 // Otherwise, it's the slow path for us (no success at this point). 1917 1918 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 1919 1920 if (super_check_offset.is_register()) { 1921 beq(CCR0, *L_success); 1922 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 1923 if (L_failure == &L_fallthrough) { 1924 beq(CCR0, *L_slow_path); 1925 } else { 1926 bne(CCR0, *L_failure); 1927 FINAL_JUMP(*L_slow_path); 1928 } 1929 } else { 1930 if (super_check_offset.as_constant() == sc_offset) { 1931 // Need a slow path; fast failure is impossible. 1932 if (L_slow_path == &L_fallthrough) { 1933 beq(CCR0, *L_success); 1934 } else { 1935 bne(CCR0, *L_slow_path); 1936 FINAL_JUMP(*L_success); 1937 } 1938 } else { 1939 // No slow path; it's a fast decision. 1940 if (L_failure == &L_fallthrough) { 1941 beq(CCR0, *L_success); 1942 } else { 1943 bne(CCR0, *L_failure); 1944 FINAL_JUMP(*L_success); 1945 } 1946 } 1947 } 1948 1949 bind(L_fallthrough); 1950 #undef FINAL_JUMP 1951 } 1952 1953 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1954 Register super_klass, 1955 Register temp1_reg, 1956 Register temp2_reg, 1957 Label* L_success, 1958 Register result_reg) { 1959 const Register array_ptr = temp1_reg; // current value from cache array 1960 const Register temp = temp2_reg; 1961 1962 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 1963 1964 int source_offset = in_bytes(Klass::secondary_supers_offset()); 1965 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 1966 1967 int length_offset = Array<Klass*>::length_offset_in_bytes(); 1968 int base_offset = Array<Klass*>::base_offset_in_bytes(); 1969 1970 Label hit, loop, failure, fallthru; 1971 1972 ld(array_ptr, source_offset, sub_klass); 1973 1974 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 1975 lwz(temp, length_offset, array_ptr); 1976 cmpwi(CCR0, temp, 0); 1977 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 1978 1979 mtctr(temp); // load ctr 1980 1981 bind(loop); 1982 // Oops in table are NO MORE compressed. 
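// The scan emitted here is equivalent to the following illustrative sketch
// (field accesses spelled out for readability, not actual emitted code):
//   for (int i = 0; i < sub_klass->secondary_supers()->length(); i++) {
//     if (sub_klass->secondary_supers()->at(i) == super_klass) {
//       sub_klass->_secondary_super_cache = super_klass;   // remember the hit
//       result = 0; goto hit;                               // 0 indicates a hit
//     }
//   }
//   result = 1;                                             // non-zero indicates a miss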
1983 ld(temp, base_offset, array_ptr); 1984 cmpd(CCR0, temp, super_klass); 1985 beq(CCR0, hit); 1986 addi(array_ptr, array_ptr, BytesPerWord); 1987 bdnz(loop); 1988 1989 bind(failure); 1990 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 1991 b(fallthru); 1992 1993 bind(hit); 1994 std(super_klass, target_offset, sub_klass); // save result to cache 1995 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 1996 if (L_success != NULL) { b(*L_success); } 1997 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 1998 1999 bind(fallthru); 2000 } 2001 2002 // Try fast path, then go to slow one if not successful 2003 void MacroAssembler::check_klass_subtype(Register sub_klass, 2004 Register super_klass, 2005 Register temp1_reg, 2006 Register temp2_reg, 2007 Label& L_success) { 2008 Label L_failure; 2009 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2010 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2011 bind(L_failure); // Fallthru if not successful. 2012 } 2013 2014 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) { 2015 assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required"); 2016 2017 Label L_fallthrough; 2018 if (L_fast_path == NULL) { 2019 L_fast_path = &L_fallthrough; 2020 } else if (L_slow_path == NULL) { 2021 L_slow_path = &L_fallthrough; 2022 } 2023 2024 // Fast path check: class is fully initialized 2025 lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass); 2026 cmpwi(CCR0, R0, InstanceKlass::fully_initialized); 2027 beq(CCR0, *L_fast_path); 2028 2029 // Fast path check: current thread is initializer thread 2030 ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass); 2031 cmpd(CCR0, thread, R0); 2032 if (L_slow_path == &L_fallthrough) { 2033 beq(CCR0, *L_fast_path); 2034 } else if (L_fast_path == &L_fallthrough) { 2035 bne(CCR0, *L_slow_path); 2036 } else { 2037 Unimplemented(); 2038 } 2039 2040 bind(L_fallthrough); 2041 } 2042 2043 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2044 Register temp_reg, 2045 int extra_slot_offset) { 2046 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 2047 int stackElementSize = Interpreter::stackElementSize; 2048 int offset = extra_slot_offset * stackElementSize; 2049 if (arg_slot.is_constant()) { 2050 offset += arg_slot.as_constant() * stackElementSize; 2051 return offset; 2052 } else { 2053 assert(temp_reg != noreg, "must specify"); 2054 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2055 if (offset != 0) 2056 addi(temp_reg, temp_reg, offset); 2057 return temp_reg; 2058 } 2059 } 2060 2061 // Supports temp2_reg = R0. 2062 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg, 2063 Register mark_reg, Register temp_reg, 2064 Register temp2_reg, Label& done, Label* slow_case) { 2065 assert(UseBiasedLocking, "why call this otherwise?"); 2066 2067 #ifdef ASSERT 2068 assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg); 2069 #endif 2070 2071 Label cas_label; 2072 2073 // Branch to done if fast path fails and no slow_case provided. 2074 Label *slow_case_int = (slow_case != NULL) ? 
slow_case : &done; 2075 2076 // Biased locking 2077 // See whether the lock is currently biased toward our thread and 2078 // whether the epoch is still valid 2079 // Note that the runtime guarantees sufficient alignment of JavaThread 2080 // pointers to allow age to be placed into low bits 2081 assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, 2082 "biased locking makes assumptions about bit layout"); 2083 2084 if (PrintBiasedLockingStatistics) { 2085 load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg); 2086 lwzx(temp_reg, temp2_reg); 2087 addi(temp_reg, temp_reg, 1); 2088 stwx(temp_reg, temp2_reg); 2089 } 2090 2091 andi(temp_reg, mark_reg, markWord::biased_lock_mask_in_place); 2092 cmpwi(cr_reg, temp_reg, markWord::biased_lock_pattern); 2093 bne(cr_reg, cas_label); 2094 2095 load_klass(temp_reg, obj_reg); 2096 2097 load_const_optimized(temp2_reg, ~((int) markWord::age_mask_in_place)); 2098 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2099 orr(temp_reg, R16_thread, temp_reg); 2100 xorr(temp_reg, mark_reg, temp_reg); 2101 andr(temp_reg, temp_reg, temp2_reg); 2102 cmpdi(cr_reg, temp_reg, 0); 2103 if (PrintBiasedLockingStatistics) { 2104 Label l; 2105 bne(cr_reg, l); 2106 load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr()); 2107 lwzx(mark_reg, temp2_reg); 2108 addi(mark_reg, mark_reg, 1); 2109 stwx(mark_reg, temp2_reg); 2110 // restore mark_reg 2111 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2112 bind(l); 2113 } 2114 beq(cr_reg, done); 2115 2116 Label try_revoke_bias; 2117 Label try_rebias; 2118 2119 // At this point we know that the header has the bias pattern and 2120 // that we are not the bias owner in the current epoch. We need to 2121 // figure out more details about the state of the header in order to 2122 // know what operations can be legally performed on the object's 2123 // header. 2124 2125 // If the low three bits in the xor result aren't clear, that means 2126 // the prototype header is no longer biased and we have to revoke 2127 // the bias on this object. 2128 andi(temp2_reg, temp_reg, markWord::biased_lock_mask_in_place); 2129 cmpwi(cr_reg, temp2_reg, 0); 2130 bne(cr_reg, try_revoke_bias); 2131 2132 // Biasing is still enabled for this data type. See whether the 2133 // epoch of the current bias is still valid, meaning that the epoch 2134 // bits of the mark word are equal to the epoch bits of the 2135 // prototype header. (Note that the prototype header's epoch bits 2136 // only change at a safepoint.) If not, attempt to rebias the object 2137 // toward the current thread. Note that we must be absolutely sure 2138 // that the current epoch is invalid in order to do this because 2139 // otherwise the manipulations it performs on the mark word are 2140 // illegal. 2141 2142 int shift_amount = 64 - markWord::epoch_shift; 2143 // rotate epoch bits to right (little) end and set other bits to 0 2144 // [ big part | epoch | little part ] -> [ 0..0 | epoch ] 2145 rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markWord::epoch_bits); 2146 // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented 2147 bne(CCR0, try_rebias); 2148 2149 // The epoch of the current bias is still valid but we know nothing 2150 // about the owner; it might be set or it might be clear. Try to 2151 // acquire the bias of the object using an atomic operation. If this 2152 // fails we will go in to the runtime to revoke the object's bias. 
2153 // Note that we first construct the presumed unbiased header so we 2154 // don't accidentally blow away another thread's valid bias. 2155 andi(mark_reg, mark_reg, (markWord::biased_lock_mask_in_place | 2156 markWord::age_mask_in_place | 2157 markWord::epoch_mask_in_place)); 2158 orr(temp_reg, R16_thread, mark_reg); 2159 2160 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2161 2162 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 2163 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2164 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2165 /*where=*/obj_reg, 2166 MacroAssembler::MemBarAcq, 2167 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2168 noreg, slow_case_int); // bail out if failed 2169 2170 // If the biasing toward our thread failed, this means that 2171 // another thread succeeded in biasing it toward itself and we 2172 // need to revoke that bias. The revocation will occur in the 2173 // interpreter runtime in the slow case. 2174 if (PrintBiasedLockingStatistics) { 2175 load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg); 2176 lwzx(temp_reg, temp2_reg); 2177 addi(temp_reg, temp_reg, 1); 2178 stwx(temp_reg, temp2_reg); 2179 } 2180 b(done); 2181 2182 bind(try_rebias); 2183 // At this point we know the epoch has expired, meaning that the 2184 // current "bias owner", if any, is actually invalid. Under these 2185 // circumstances _only_, we are allowed to use the current header's 2186 // value as the comparison value when doing the cas to acquire the 2187 // bias in the current epoch. In other words, we allow transfer of 2188 // the bias from one thread to another directly in this situation. 2189 load_klass(temp_reg, obj_reg); 2190 andi(temp2_reg, mark_reg, markWord::age_mask_in_place); 2191 orr(temp2_reg, R16_thread, temp2_reg); 2192 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2193 orr(temp_reg, temp2_reg, temp_reg); 2194 2195 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2196 2197 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2198 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2199 /*where=*/obj_reg, 2200 MacroAssembler::MemBarAcq, 2201 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2202 noreg, slow_case_int); // bail out if failed 2203 2204 // If the biasing toward our thread failed, this means that 2205 // another thread succeeded in biasing it toward itself and we 2206 // need to revoke that bias. The revocation will occur in the 2207 // interpreter runtime in the slow case. 2208 if (PrintBiasedLockingStatistics) { 2209 load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg); 2210 lwzx(temp_reg, temp2_reg); 2211 addi(temp_reg, temp_reg, 1); 2212 stwx(temp_reg, temp2_reg); 2213 } 2214 b(done); 2215 2216 bind(try_revoke_bias); 2217 // The prototype mark in the klass doesn't have the bias bit set any 2218 // more, indicating that objects of this data type are not supposed 2219 // to be biased any more. We are going to try to reset the mark of 2220 // this object to the prototype value and fall through to the 2221 // CAS-based locking scheme. Note that if our CAS fails, it means 2222 // that another thread raced us for the privilege of revoking the 2223 // bias of this particular object, so it's okay to continue in the 2224 // normal locking code. 
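// Intent of the revocation below, as an illustrative sketch (not the emitted
// instructions; the CAS result is deliberately ignored):
//   expected = mark_reg;                                  // the biased header we observed
//   unbiased = klass->prototype_header() | age(mark_reg);
//   CAS(&obj->mark(), expected, unbiased);                // some thread clears the bias bit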
2225 load_klass(temp_reg, obj_reg); 2226 ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg); 2227 andi(temp2_reg, mark_reg, markWord::age_mask_in_place); 2228 orr(temp_reg, temp_reg, temp2_reg); 2229 2230 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2231 2232 // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). 2233 cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg, 2234 /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg, 2235 /*where=*/obj_reg, 2236 MacroAssembler::MemBarAcq, 2237 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2238 2239 // reload markWord in mark_reg before continuing with lightweight locking 2240 ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg); 2241 2242 // Fall through to the normal CAS-based lock, because no matter what 2243 // the result of the above CAS, some thread must have succeeded in 2244 // removing the bias bit from the object's header. 2245 if (PrintBiasedLockingStatistics) { 2246 Label l; 2247 bne(cr_reg, l); 2248 load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg); 2249 lwzx(temp_reg, temp2_reg); 2250 addi(temp_reg, temp_reg, 1); 2251 stwx(temp_reg, temp2_reg); 2252 bind(l); 2253 } 2254 2255 bind(cas_label); 2256 } 2257 2258 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) { 2259 // Check for biased locking unlock case, which is a no-op 2260 // Note: we do not have to check the thread ID for two reasons. 2261 // First, the interpreter checks for IllegalMonitorStateException at 2262 // a higher level. Second, if the bias was revoked while we held the 2263 // lock, the object could not be rebiased toward another thread, so 2264 // the bias bit would be clear. 
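// Illustrative sketch of the check below: a still-biased header means there is
// nothing to unlock, so we simply branch to done.
//   if ((*mark_addr & biased_lock_mask) == biased_lock_pattern) goto done;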
2265 2266 ld(temp_reg, 0, mark_addr); 2267 andi(temp_reg, temp_reg, markWord::biased_lock_mask_in_place); 2268 2269 cmpwi(cr_reg, temp_reg, markWord::biased_lock_pattern); 2270 beq(cr_reg, done); 2271 } 2272 2273 // allocation (for C1) 2274 void MacroAssembler::eden_allocate( 2275 Register obj, // result: pointer to object after successful allocation 2276 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2277 int con_size_in_bytes, // object size in bytes if known at compile time 2278 Register t1, // temp register 2279 Register t2, // temp register 2280 Label& slow_case // continuation point if fast allocation fails 2281 ) { 2282 b(slow_case); 2283 } 2284 2285 void MacroAssembler::tlab_allocate( 2286 Register obj, // result: pointer to object after successful allocation 2287 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2288 int con_size_in_bytes, // object size in bytes if known at compile time 2289 Register t1, // temp register 2290 Label& slow_case // continuation point if fast allocation fails 2291 ) { 2292 // make sure arguments make sense 2293 assert_different_registers(obj, var_size_in_bytes, t1); 2294 assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size"); 2295 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2296 2297 const Register new_top = t1; 2298 //verify_tlab(); not implemented 2299 2300 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2301 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2302 if (var_size_in_bytes == noreg) { 2303 addi(new_top, obj, con_size_in_bytes); 2304 } else { 2305 add(new_top, obj, var_size_in_bytes); 2306 } 2307 cmpld(CCR0, new_top, R0); 2308 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2309 2310 #ifdef ASSERT 2311 // make sure new free pointer is properly aligned 2312 { 2313 Label L; 2314 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2315 beq(CCR0, L); 2316 stop("updated TLAB free is not properly aligned", 0x934); 2317 bind(L); 2318 } 2319 #endif // ASSERT 2320 2321 // update the tlab top pointer 2322 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2323 //verify_tlab(); not implemented 2324 } 2325 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) { 2326 unimplemented("incr_allocated_bytes"); 2327 } 2328 2329 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2330 int insts_call_instruction_offset, Register Rtoc) { 2331 // Start the stub. 2332 address stub = start_a_stub(64); 2333 if (stub == NULL) { return NULL; } // CodeCache full: bail out 2334 2335 // Create a trampoline stub relocation which relates this trampoline stub 2336 // with the call instruction at insts_call_instruction_offset in the 2337 // instructions code-section. 2338 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2339 const int stub_start_offset = offset(); 2340 2341 // For java_to_interp stubs we use R11_scratch1 as scratch register 2342 // and in call trampoline stubs we use R12_scratch2. This way we 2343 // can distinguish them (see is_NativeCallTrampolineStub_at()). 
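// The stub emitted below is, roughly (illustrative sketch; Rtoc is first
// materialized from the global TOC if the caller passed noreg):
//   ld    R12_scratch2, destination_toc_offset(Rtoc)   // load the call target
//   mtctr R12_scratch2
//   bctr                                                // jump to the target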
2344 Register reg_scratch = R12_scratch2; 2345 2346 // Now, create the trampoline stub's code: 2347 // - load the TOC 2348 // - load the call target from the constant pool 2349 // - call 2350 if (Rtoc == noreg) { 2351 calculate_address_from_global_toc(reg_scratch, method_toc()); 2352 Rtoc = reg_scratch; 2353 } 2354 2355 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2356 mtctr(reg_scratch); 2357 bctr(); 2358 2359 const address stub_start_addr = addr_at(stub_start_offset); 2360 2361 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 2362 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2363 "encoded offset into the constant pool must match"); 2364 // Trampoline_stub_size should be good. 2365 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2366 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2367 2368 // End the stub. 2369 end_a_stub(); 2370 return stub; 2371 } 2372 2373 // TM on PPC64. 2374 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) { 2375 Label retry; 2376 bind(retry); 2377 ldarx(result, addr, /*hint*/ false); 2378 addi(result, result, simm16); 2379 stdcx_(result, addr); 2380 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2381 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2382 } else { 2383 bne( CCR0, retry); // stXcx_ sets CCR0 2384 } 2385 } 2386 2387 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) { 2388 Label retry; 2389 bind(retry); 2390 lwarx(result, addr, /*hint*/ false); 2391 ori(result, result, uimm16); 2392 stwcx_(result, addr); 2393 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 2394 bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0 2395 } else { 2396 bne( CCR0, retry); // stXcx_ sets CCR0 2397 } 2398 } 2399 2400 #if INCLUDE_RTM_OPT 2401 2402 // Update rtm_counters based on abort status 2403 // input: abort_status 2404 // rtm_counters_Reg (RTMLockingCounters*) 2405 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) { 2406 // Mapping to keep PreciseRTMLockingStatistics similar to x86. 2407 // x86 ppc (! means inverted, ? means not the same) 2408 // 0 31 Set if abort caused by XABORT instruction. 2409 // 1 ! 7 If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set. 2410 // 2 13 Set if another logical processor conflicted with a memory address that was part of the transaction that aborted. 2411 // 3 10 Set if an internal buffer overflowed. 2412 // 4 ?12 Set if a debug breakpoint was hit. 2413 // 5 ?32 Set if an abort occurred during execution of a nested transaction. 2414 const int failure_bit[] = {tm_tabort, // Signal handler will set this too. 2415 tm_failure_persistent, 2416 tm_non_trans_cf, 2417 tm_trans_cf, 2418 tm_footprint_of, 2419 tm_failure_code, 2420 tm_transaction_level}; 2421 2422 const int num_failure_bits = sizeof(failure_bit) / sizeof(int); 2423 const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT; 2424 2425 const int bit2counter_map[][num_counters] = 2426 // 0 = no map; 1 = mapped, no inverted logic; -1 = mapped, inverted logic 2427 // Inverted logic means that if a bit is set don't count it, or vice-versa. 2428 // Care must be taken when mapping bits to counters as bits for a given 2429 // counter must be mutually exclusive. 
Otherwise, the counter will be 2430 // incremented more than once. 2431 // counters: 2432 // 0 1 2 3 4 5 2433 // abort , persist, conflict, overflow, debug , nested bits: 2434 {{ 1 , 0 , 0 , 0 , 0 , 0 }, // abort 2435 { 0 , -1 , 0 , 0 , 0 , 0 }, // failure_persistent 2436 { 0 , 0 , 1 , 0 , 0 , 0 }, // non_trans_cf 2437 { 0 , 0 , 1 , 0 , 0 , 0 }, // trans_cf 2438 { 0 , 0 , 0 , 1 , 0 , 0 }, // footprint_of 2439 { 0 , 0 , 0 , 0 , -1 , 0 }, // failure_code = 0xD4 2440 { 0 , 0 , 0 , 0 , 0 , 1 }}; // transaction_level > 1 2441 // ... 2442 2443 // Move abort_status value to R0 and use abort_status register as a 2444 // temporary register because R0 as third operand in ld/std is treated 2445 // as base address zero (value). Likewise, R0 as second operand in addi 2446 // is problematic because it amounts to li. 2447 const Register temp_Reg = abort_status; 2448 const Register abort_status_R0 = R0; 2449 mr(abort_status_R0, abort_status); 2450 2451 // Increment total abort counter. 2452 int counters_offs = RTMLockingCounters::abort_count_offset(); 2453 ld(temp_Reg, counters_offs, rtm_counters_Reg); 2454 addi(temp_Reg, temp_Reg, 1); 2455 std(temp_Reg, counters_offs, rtm_counters_Reg); 2456 2457 // Increment specific abort counters. 2458 if (PrintPreciseRTMLockingStatistics) { 2459 2460 // #0 counter offset. 2461 int abortX_offs = RTMLockingCounters::abortX_count_offset(); 2462 2463 for (int nbit = 0; nbit < num_failure_bits; nbit++) { 2464 for (int ncounter = 0; ncounter < num_counters; ncounter++) { 2465 if (bit2counter_map[nbit][ncounter] != 0) { 2466 Label check_abort; 2467 int abort_counter_offs = abortX_offs + (ncounter << 3); 2468 2469 if (failure_bit[nbit] == tm_transaction_level) { 2470 // Don't check outer transaction, TL = 1 (bit 63). Hence only 2471 // 11 bits in the TL field are checked to find out if failure 2472 // occurred in a nested transaction. This check also matches 2473 // the case when nesting_of = 1 (nesting overflow). 2474 rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10); 2475 } else if (failure_bit[nbit] == tm_failure_code) { 2476 // Check failure code for trap or illegal caught in TM. 2477 // Bits 0:7 are tested as bit 7 (persistent) is copied from 2478 // tabort or treclaim source operand. 2479 // On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4). 2480 rldicl(temp_Reg, abort_status_R0, 8, 56); 2481 cmpdi(CCR0, temp_Reg, 0xD4); 2482 } else { 2483 rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0); 2484 } 2485 2486 if (bit2counter_map[nbit][ncounter] == 1) { 2487 beq(CCR0, check_abort); 2488 } else { 2489 bne(CCR0, check_abort); 2490 } 2491 2492 // We don't increment atomically. 2493 ld(temp_Reg, abort_counter_offs, rtm_counters_Reg); 2494 addi(temp_Reg, temp_Reg, 1); 2495 std(temp_Reg, abort_counter_offs, rtm_counters_Reg); 2496 2497 bind(check_abort); 2498 } 2499 } 2500 } 2501 } 2502 // Restore abort_status. 2503 mr(abort_status, abort_status_R0); 2504 } 2505 2506 // Branch if (random & (count-1) != 0), count is 2^n 2507 // tmp and CR0 are killed 2508 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) { 2509 mftb(tmp); 2510 andi_(tmp, tmp, count-1); 2511 bne(CCR0, brLabel); 2512 } 2513 2514 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2515 // input: rtm_counters_Reg (RTMLockingCounters* address) - KILLED 2516 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg, 2517 RTMLockingCounters* rtm_counters, 2518 Metadata* method_data) { 2519 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 2520 2521 if (RTMLockingCalculationDelay > 0) { 2522 // Delay calculation. 2523 ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr()); 2524 cmpdi(CCR0, rtm_counters_Reg, 0); 2525 beq(CCR0, L_done); 2526 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2527 } 2528 // Abort ratio calculation only if abort_count > RTMAbortThreshold. 2529 // Aborted transactions = abort_count * 100 2530 // All transactions = total_count * RTMTotalCountIncrRate 2531 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 2532 ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg); 2533 if (is_simm(RTMAbortThreshold, 16)) { // cmpdi can handle 16bit immediate only. 2534 cmpdi(CCR0, R0, RTMAbortThreshold); 2535 blt(CCR0, L_check_always_rtm2); // reload of rtm_counters_Reg not necessary 2536 } else { 2537 load_const_optimized(rtm_counters_Reg, RTMAbortThreshold); 2538 cmpd(CCR0, R0, rtm_counters_Reg); 2539 blt(CCR0, L_check_always_rtm1); // reload of rtm_counters_Reg required 2540 } 2541 mulli(R0, R0, 100); 2542 2543 const Register tmpReg = rtm_counters_Reg; 2544 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2545 mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16 2546 mulli(tmpReg, tmpReg, RTMAbortRatio); // allowable range: int16 2547 cmpd(CCR0, R0, tmpReg); 2548 blt(CCR0, L_check_always_rtm1); // jump to reload 2549 if (method_data != NULL) { 2550 // Set rtm_state to "no rtm" in MDO. 2551 // Not using a metadata relocation. Method and Class Loader are kept alive anyway. 2552 // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.) 2553 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2554 atomic_ori_int(R0, tmpReg, NoRTM); 2555 } 2556 b(L_done); 2557 2558 bind(L_check_always_rtm1); 2559 load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload 2560 bind(L_check_always_rtm2); 2561 ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg); 2562 int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate; 2563 if (is_simm(thresholdValue, 16)) { // cmpdi can handle 16bit immediate only. 2564 cmpdi(CCR0, tmpReg, thresholdValue); 2565 } else { 2566 load_const_optimized(R0, thresholdValue); 2567 cmpd(CCR0, tmpReg, R0); 2568 } 2569 blt(CCR0, L_done); 2570 if (method_data != NULL) { 2571 // Set rtm_state to "always rtm" in MDO. 2572 // Not using a metadata relocation. See above. 2573 load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg); 2574 atomic_ori_int(R0, tmpReg, UseRTM); 2575 } 2576 bind(L_done); 2577 } 2578 2579 // Update counters and perform abort ratio calculation. 2580 // input: abort_status_Reg 2581 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg, 2582 RTMLockingCounters* rtm_counters, 2583 Metadata* method_data, 2584 bool profile_rtm) { 2585 2586 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2587 // Update rtm counters based on state at abort. 2588 // Reads abort_status_Reg, updates flags. 
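// In short (illustrative summary): load the RTMLockingCounters* into temp_Reg,
// bump the abort counters matching abort_status_Reg, and, if profiling is
// requested, re-evaluate the abort ratio, which may flip the MDO's rtm_state.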
2589 assert_different_registers(abort_status_Reg, temp_Reg); 2590 load_const_optimized(temp_Reg, (address)rtm_counters, R0); 2591 rtm_counters_update(abort_status_Reg, temp_Reg); 2592 if (profile_rtm) { 2593 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2594 rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data); 2595 } 2596 } 2597 2598 // Retry on abort if abort's status indicates non-persistent failure. 2599 // inputs: retry_count_Reg 2600 // : abort_status_Reg 2601 // output: retry_count_Reg decremented by 1 2602 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, 2603 Label& retryLabel, Label* checkRetry) { 2604 Label doneRetry; 2605 2606 // Don't retry if failure is persistent. 2607 // The persistent bit is set when a (A) Disallowed operation is performed in 2608 // transactional state, like for instance trying to write the TFHAR after a 2609 // transaction is started; or when there is (B) a Nesting Overflow (too many 2610 // nested transactions); or when (C) the Footprint overflows (too many 2611 // addresses touched in TM state so there is no more space in the footprint 2612 // area to track them); or in case of (D) a Self-Induced Conflict, i.e. a 2613 // store is performed to a given address in TM state, then once in suspended 2614 // state the same address is accessed. Failure (A) is very unlikely to occur 2615 // in the JVM. Failure (D) will never occur because Suspended state is never 2616 // used in the JVM. Thus mostly (B) a Nesting Overflow or (C) a Footprint 2617 // Overflow will set the persistent bit. 2618 rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0); 2619 bne(CCR0, doneRetry); 2620 2621 // Don't retry if transaction was deliberately aborted, i.e. caused by a 2622 // tabort instruction. 2623 rldicr_(R0, abort_status_Reg, tm_tabort, 0); 2624 bne(CCR0, doneRetry); 2625 2626 // Retry if transaction aborted due to a conflict with another thread. 2627 if (checkRetry) { bind(*checkRetry); } 2628 addic_(retry_count_Reg, retry_count_Reg, -1); 2629 blt(CCR0, doneRetry); 2630 b(retryLabel); 2631 bind(doneRetry); 2632 } 2633 2634 // Spin and retry if lock is busy. 2635 // inputs: owner_addr_Reg (monitor address) 2636 // : retry_count_Reg 2637 // output: retry_count_Reg decremented by 1 2638 // CTR is killed 2639 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) { 2640 Label SpinLoop, doneRetry, doRetry; 2641 addic_(retry_count_Reg, retry_count_Reg, -1); 2642 blt(CCR0, doneRetry); 2643 2644 if (RTMSpinLoopCount > 1) { 2645 li(R0, RTMSpinLoopCount); 2646 mtctr(R0); 2647 } 2648 2649 // low thread priority 2650 smt_prio_low(); 2651 bind(SpinLoop); 2652 2653 if (RTMSpinLoopCount > 1) { 2654 bdz(doRetry); 2655 ld(R0, 0, owner_addr_Reg); 2656 cmpdi(CCR0, R0, 0); 2657 bne(CCR0, SpinLoop); 2658 } 2659 2660 bind(doRetry); 2661 2662 // restore thread priority to default in userspace 2663 #ifdef LINUX 2664 smt_prio_medium_low(); 2665 #else 2666 smt_prio_medium(); 2667 #endif 2668 2669 b(retryLabel); 2670 2671 bind(doneRetry); 2672 } 2673 2674 // Use RTM for normal stack locks.
2675 // Input: objReg (object to lock) 2676 void MacroAssembler::rtm_stack_locking(ConditionRegister flag, 2677 Register obj, Register mark_word, Register tmp, 2678 Register retry_on_abort_count_Reg, 2679 RTMLockingCounters* stack_rtm_counters, 2680 Metadata* method_data, bool profile_rtm, 2681 Label& DONE_LABEL, Label& IsInflated) { 2682 assert(UseRTMForStackLocks, "why call this otherwise?"); 2683 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2684 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2685 2686 if (RTMRetryCount > 0) { 2687 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 2688 bind(L_rtm_retry); 2689 } 2690 andi_(R0, mark_word, markWord::monitor_value); // inflated vs stack-locked|neutral|biased 2691 bne(CCR0, IsInflated); 2692 2693 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2694 Label L_noincrement; 2695 if (RTMTotalCountIncrRate > 1) { 2696 branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement); 2697 } 2698 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 2699 load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0); 2700 //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically 2701 ldx(mark_word, tmp); 2702 addi(mark_word, mark_word, 1); 2703 stdx(mark_word, tmp); 2704 bind(L_noincrement); 2705 } 2706 tbegin_(); 2707 beq(CCR0, L_on_abort); 2708 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // Reload in transaction, conflicts need to be tracked. 2709 andi(R0, mark_word, markWord::biased_lock_mask_in_place); // look at 3 lock bits 2710 cmpwi(flag, R0, markWord::unlocked_value); // bits = 001 unlocked 2711 beq(flag, DONE_LABEL); // all done if unlocked 2712 2713 if (UseRTMXendForLockBusy) { 2714 tend_(); 2715 b(L_decrement_retry); 2716 } else { 2717 tabort_(); 2718 } 2719 bind(L_on_abort); 2720 const Register abort_status_Reg = tmp; 2721 mftexasr(abort_status_Reg); 2722 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2723 rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm); 2724 } 2725 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload 2726 if (RTMRetryCount > 0) { 2727 // Retry on lock abort if abort status is not permanent. 2728 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry); 2729 } else { 2730 bind(L_decrement_retry); 2731 } 2732 } 2733 2734 // Use RTM for inflating locks 2735 // inputs: obj (object to lock) 2736 // mark_word (current header - KILLED) 2737 // boxReg (on-stack box address (displaced header location) - KILLED) 2738 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag, 2739 Register obj, Register mark_word, Register boxReg, 2740 Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg, 2741 RTMLockingCounters* rtm_counters, 2742 Metadata* method_data, bool profile_rtm, 2743 Label& DONE_LABEL) { 2744 assert(UseRTMLocking, "why call this otherwise?"); 2745 Label L_rtm_retry, L_decrement_retry, L_on_abort; 2746 // Clean monitor_value bit to get valid pointer. 2747 int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markWord::monitor_value; 2748 2749 // Store non-null, using boxReg instead of (intptr_t)markWord::unused_mark(). 
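// The store below initializes the box; the transactional path that follows is,
// roughly (illustrative sketch, not the emitted instructions):
//   tbegin.
//   if (monitor->_owner == NULL)              -> DONE_LABEL (lock held speculatively)
//   tend./tabort., profile, maybe retry
//   CAS(&monitor->_owner, NULL, R16_thread)   // non-speculative fallback below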
2750 std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg); 2751 const Register tmpReg = boxReg; 2752 const Register owner_addr_Reg = mark_word; 2753 addi(owner_addr_Reg, mark_word, owner_offset); 2754 2755 if (RTMRetryCount > 0) { 2756 load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy. 2757 load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort. 2758 bind(L_rtm_retry); 2759 } 2760 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2761 Label L_noincrement; 2762 if (RTMTotalCountIncrRate > 1) { 2763 branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement); 2764 } 2765 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 2766 load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg); 2767 //atomic_inc_ptr(R0, tmpReg); We don't increment atomically 2768 ldx(tmpReg, R0); 2769 addi(tmpReg, tmpReg, 1); 2770 stdx(tmpReg, R0); 2771 bind(L_noincrement); 2772 } 2773 tbegin_(); 2774 beq(CCR0, L_on_abort); 2775 // We don't reload mark word. Will only be reset at safepoint. 2776 ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked. 2777 cmpdi(flag, R0, 0); 2778 beq(flag, DONE_LABEL); 2779 2780 if (UseRTMXendForLockBusy) { 2781 tend_(); 2782 b(L_decrement_retry); 2783 } else { 2784 tabort_(); 2785 } 2786 bind(L_on_abort); 2787 const Register abort_status_Reg = tmpReg; 2788 mftexasr(abort_status_Reg); 2789 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 2790 rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm); 2791 // Restore owner_addr_Reg 2792 ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); 2793 #ifdef ASSERT 2794 andi_(R0, mark_word, markWord::monitor_value); 2795 asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint. 2796 #endif 2797 addi(owner_addr_Reg, mark_word, owner_offset); 2798 } 2799 if (RTMRetryCount > 0) { 2800 // Retry on lock abort if abort status is not permanent. 2801 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 2802 } 2803 2804 // Appears unlocked - try to swing _owner from null to non-null. 2805 cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg, 2806 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2807 MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true); 2808 2809 if (RTMRetryCount > 0) { 2810 // success done else retry 2811 b(DONE_LABEL); 2812 bind(L_decrement_retry); 2813 // Spin and retry if lock is busy. 2814 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry); 2815 } else { 2816 bind(L_decrement_retry); 2817 } 2818 } 2819 2820 #endif // INCLUDE_RTM_OPT 2821 2822 // "The box" is the space on the stack where we copy the object mark. 2823 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2824 Register temp, Register displaced_header, Register current_header, 2825 bool try_bias, 2826 RTMLockingCounters* rtm_counters, 2827 RTMLockingCounters* stack_rtm_counters, 2828 Metadata* method_data, 2829 bool use_rtm, bool profile_rtm) { 2830 assert_different_registers(oop, box, temp, displaced_header, current_header); 2831 assert(flag != CCR0, "bad condition register"); 2832 Label cont; 2833 Label object_has_monitor; 2834 Label cas_failed; 2835 2836 // Load markWord from object into displaced_header. 
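// Overall fast-lock outline, as an illustrative sketch ('box' is the on-stack
// BasicLock; biased locking and RTM, if enabled, are tried first):
//   mark = obj->mark();
//   if (mark & monitor_value)                            -> inflated path
//   box->displaced_header = mark | unlocked_value;
//   if (CAS(&obj->mark(), mark | unlocked_value, box))   -> locked (flag = EQ)
//   else if (mark points into our own stack)             -> recursive: box->displaced_header = 0
//   else                                                 -> failure (flag = NE)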
2837 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2838 2839 2840 if (try_bias) { 2841 biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont); 2842 } 2843 2844 #if INCLUDE_RTM_OPT 2845 if (UseRTMForStackLocks && use_rtm) { 2846 rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header, 2847 stack_rtm_counters, method_data, profile_rtm, 2848 cont, object_has_monitor); 2849 } 2850 #endif // INCLUDE_RTM_OPT 2851 2852 // Handle existing monitor. 2853 // The object has an existing monitor iff (mark & monitor_value) != 0. 2854 andi_(temp, displaced_header, markWord::monitor_value); 2855 bne(CCR0, object_has_monitor); 2856 2857 // Set displaced_header to be (markWord of object | UNLOCK_VALUE). 2858 ori(displaced_header, displaced_header, markWord::unlocked_value); 2859 2860 // Load Compare Value application register. 2861 2862 // Initialize the box. (Must happen before we update the object mark!) 2863 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2864 2865 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2866 // Compare object markWord with mark and if equal exchange scratch1 with object markWord. 2867 cmpxchgd(/*flag=*/flag, 2868 /*current_value=*/current_header, 2869 /*compare_value=*/displaced_header, 2870 /*exchange_value=*/box, 2871 /*where=*/oop, 2872 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2873 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2874 noreg, 2875 &cas_failed, 2876 /*check without membar and ldarx first*/true); 2877 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2878 2879 // If the compare-and-exchange succeeded, then we found an unlocked 2880 // object and we have now locked it. 2881 b(cont); 2882 2883 bind(cas_failed); 2884 // We did not see an unlocked object so try the fast recursive case. 2885 2886 // Check if the owner is self by comparing the value in the markWord of object 2887 // (current_header) with the stack pointer. 2888 sub(current_header, current_header, R1_SP); 2889 load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place); 2890 2891 and_(R0/*==0?*/, current_header, temp); 2892 // If condition is true we are cont and hence we can store 0 as the 2893 // displaced header in the box, which indicates that it is a recursive lock. 2894 mcrf(flag,CCR0); 2895 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box); 2896 2897 // Handle existing monitor. 2898 b(cont); 2899 2900 bind(object_has_monitor); 2901 // The object's monitor m is unlocked iff m->owner == NULL, 2902 // otherwise m->owner may contain a thread or a stack address. 2903 2904 #if INCLUDE_RTM_OPT 2905 // Use the same RTM locking code in 32- and 64-bit VM. 2906 if (use_rtm) { 2907 rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header, 2908 rtm_counters, method_data, profile_rtm, cont); 2909 } else { 2910 #endif // INCLUDE_RTM_OPT 2911 2912 // Try to CAS m->owner from NULL to current thread. 2913 addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markWord::monitor_value); 2914 cmpxchgd(/*flag=*/flag, 2915 /*current_value=*/current_header, 2916 /*compare_value=*/(intptr_t)0, 2917 /*exchange_value=*/R16_thread, 2918 /*where=*/temp, 2919 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2920 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2921 2922 // Store a non-null value into the box. 
2923 std(box, BasicLock::displaced_header_offset_in_bytes(), box); 2924 2925 # ifdef ASSERT 2926 bne(flag, cont); 2927 // We have acquired the monitor, check some invariants. 2928 addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes()); 2929 // Invariant 1: _recursions should be 0. 2930 //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size"); 2931 asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp, 2932 "monitor->_recursions should be 0", -1); 2933 # endif 2934 2935 #if INCLUDE_RTM_OPT 2936 } // use_rtm() 2937 #endif 2938 2939 bind(cont); 2940 // flag == EQ indicates success 2941 // flag == NE indicates failure 2942 } 2943 2944 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box, 2945 Register temp, Register displaced_header, Register current_header, 2946 bool try_bias, bool use_rtm) { 2947 assert_different_registers(oop, box, temp, displaced_header, current_header); 2948 assert(flag != CCR0, "bad condition register"); 2949 Label cont; 2950 Label object_has_monitor; 2951 2952 if (try_bias) { 2953 biased_locking_exit(flag, oop, current_header, cont); 2954 } 2955 2956 #if INCLUDE_RTM_OPT 2957 if (UseRTMForStackLocks && use_rtm) { 2958 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 2959 Label L_regular_unlock; 2960 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword 2961 andi(R0, current_header, markWord::biased_lock_mask_in_place); // look at 3 lock bits 2962 cmpwi(flag, R0, markWord::unlocked_value); // bits = 001 unlocked 2963 bne(flag, L_regular_unlock); // else RegularLock 2964 tend_(); // otherwise end... 2965 b(cont); // ... and we're done 2966 bind(L_regular_unlock); 2967 } 2968 #endif 2969 2970 // Find the lock address and load the displaced header from the stack. 2971 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2972 2973 // If the displaced header is 0, we have a recursive unlock. 2974 cmpdi(flag, displaced_header, 0); 2975 beq(flag, cont); 2976 2977 // Handle existing monitor. 2978 // The object has an existing monitor iff (mark & monitor_value) != 0. 2979 RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done 2980 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); 2981 andi_(R0, current_header, markWord::monitor_value); 2982 bne(CCR0, object_has_monitor); 2983 2984 // Check if it is still a lightweight lock; this is true if we see 2985 // the stack address of the basicLock in the markWord of the object. 2986 // Cmpxchg sets flag to cmpd(current_header, box). 2987 cmpxchgd(/*flag=*/flag, 2988 /*current_value=*/current_header, 2989 /*compare_value=*/box, 2990 /*exchange_value=*/displaced_header, 2991 /*where=*/oop, 2992 MacroAssembler::MemBarRel, 2993 MacroAssembler::cmpxchgx_hint_release_lock(), 2994 noreg, 2995 &cont); 2996 2997 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2998 2999 // Handle existing monitor. 3000 b(cont); 3001 3002 bind(object_has_monitor); 3003 STATIC_ASSERT(markWord::monitor_value <= INT_MAX); 3004 addi(current_header, current_header, -(int)markWord::monitor_value); // monitor 3005 ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 3006 3007 // It's inflated.
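// Inflated unlock, in short (illustrative sketch; flag = EQ indicates success):
//   if (monitor->_owner != R16_thread || monitor->_recursions != 0)  -> fail
//   if (monitor->_EntryList != NULL || monitor->_cxq != NULL)        -> fail (slow path)
//   release(); monitor->_owner = NULL;                               // success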
3008 #if INCLUDE_RTM_OPT 3009 if (use_rtm) { 3010 Label L_regular_inflated_unlock; 3011 // Clean monitor_value bit to get valid pointer 3012 cmpdi(flag, temp, 0); 3013 bne(flag, L_regular_inflated_unlock); 3014 tend_(); 3015 b(cont); 3016 bind(L_regular_inflated_unlock); 3017 } 3018 #endif 3019 3020 ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header); 3021 xorr(temp, R16_thread, temp); // Will be 0 if we are the owner. 3022 orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions. 3023 cmpdi(flag, temp, 0); 3024 bne(flag, cont); 3025 3026 ld(temp, ObjectMonitor::EntryList_offset_in_bytes(), current_header); 3027 ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header); 3028 orr(temp, temp, displaced_header); // Will be 0 if both are 0. 3029 cmpdi(flag, temp, 0); 3030 bne(flag, cont); 3031 release(); 3032 std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); 3033 3034 bind(cont); 3035 // flag == EQ indicates success 3036 // flag == NE indicates failure 3037 } 3038 3039 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) { 3040 if (SafepointMechanism::uses_thread_local_poll()) { 3041 ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread); 3042 // Armed page has poll_bit set. 3043 andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit()); 3044 } else { 3045 lwz(temp_reg, (RegisterOrConstant)(intptr_t)SafepointSynchronize::address_of_state()); 3046 cmpwi(CCR0, temp_reg, SafepointSynchronize::_not_synchronized); 3047 } 3048 bne(CCR0, slow_path); 3049 } 3050 3051 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) { 3052 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3053 bs->resolve_jobject(this, value, tmp1, tmp2, needs_frame); 3054 } 3055 3056 // Values for last_Java_pc, and last_Java_sp must comply to the rules 3057 // in frame_ppc.hpp. 3058 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 3059 // Always set last_Java_pc and flags first because once last_Java_sp 3060 // is visible has_last_Java_frame is true and users will look at the 3061 // rest of the fields. (Note: flags should always be zero before we 3062 // get here so doesn't need to be set.) 3063 3064 // Verify that last_Java_pc was zeroed on return to Java 3065 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 3066 "last_Java_pc not zeroed before leaving Java", 0x200); 3067 3068 // When returning from calling out from Java mode the frame anchor's 3069 // last_Java_pc will always be set to NULL. It is set here so that 3070 // if we are doing a call to native (not VM) that we capture the 3071 // known pc and don't have to rely on the native call having a 3072 // standard frame linkage where we can find the pc. 3073 if (last_Java_pc != noreg) 3074 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3075 3076 // Set last_Java_sp last. 
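// Ordering implemented here, as an illustrative sketch:
//   thread->_anchor._last_Java_pc = pc;   // published first
//   thread->_anchor._last_Java_sp = sp;   // published last; makes the anchor visible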
3077 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3078 } 3079 3080 void MacroAssembler::reset_last_Java_frame(void) { 3081 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 3082 R16_thread, "SP was not set, still zero", 0x202); 3083 3084 BLOCK_COMMENT("reset_last_Java_frame {"); 3085 li(R0, 0); 3086 3087 // _last_Java_sp = 0 3088 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 3089 3090 // _last_Java_pc = 0 3091 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 3092 BLOCK_COMMENT("} reset_last_Java_frame"); 3093 } 3094 3095 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 3096 assert_different_registers(sp, tmp1); 3097 3098 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 3099 // TOP_IJAVA_FRAME_ABI. 3100 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 3101 address entry = pc(); 3102 load_const_optimized(tmp1, entry); 3103 3104 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 3105 } 3106 3107 void MacroAssembler::get_vm_result(Register oop_result) { 3108 // Read: 3109 // R16_thread 3110 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3111 // 3112 // Updated: 3113 // oop_result 3114 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 3115 3116 verify_thread(); 3117 3118 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3119 li(R0, 0); 3120 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 3121 3122 verify_oop(oop_result); 3123 } 3124 3125 void MacroAssembler::get_vm_result_2(Register metadata_result) { 3126 // Read: 3127 // R16_thread 3128 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3129 // 3130 // Updated: 3131 // metadata_result 3132 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 3133 3134 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3135 li(R0, 0); 3136 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 3137 } 3138 3139 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3140 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 3141 if (CompressedKlassPointers::base() != 0) { 3142 // Use dst as temp if it is free. 
3143 sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0); 3144 current = dst; 3145 } 3146 if (CompressedKlassPointers::shift() != 0) { 3147 srdi(dst, current, CompressedKlassPointers::shift()); 3148 current = dst; 3149 } 3150 return current; 3151 } 3152 3153 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 3154 if (UseCompressedClassPointers) { 3155 Register compressedKlass = encode_klass_not_null(ck, klass); 3156 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 3157 } else { 3158 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 3159 } 3160 } 3161 3162 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 3163 if (UseCompressedClassPointers) { 3164 if (val == noreg) { 3165 val = R0; 3166 li(val, 0); 3167 } 3168 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 3169 } 3170 } 3171 3172 int MacroAssembler::instr_size_for_decode_klass_not_null() { 3173 if (!UseCompressedClassPointers) return 0; 3174 int num_instrs = 1; // shift or move 3175 if (CompressedKlassPointers::base() != 0) num_instrs = 7; // shift + load const + add 3176 return num_instrs * BytesPerInstWord; 3177 } 3178 3179 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 3180 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 3181 if (src == noreg) src = dst; 3182 Register shifted_src = src; 3183 if (CompressedKlassPointers::shift() != 0 || 3184 CompressedKlassPointers::base() == 0 && src != dst) { // Move required. 3185 shifted_src = dst; 3186 sldi(shifted_src, src, CompressedKlassPointers::shift()); 3187 } 3188 if (CompressedKlassPointers::base() != 0) { 3189 add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0); 3190 } 3191 } 3192 3193 void MacroAssembler::load_klass(Register dst, Register src) { 3194 if (UseCompressedClassPointers) { 3195 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 3196 // Attention: no null check here! 3197 decode_klass_not_null(dst, dst); 3198 } else { 3199 ld(dst, oopDesc::klass_offset_in_bytes(), src); 3200 } 3201 } 3202 3203 // ((OopHandle)result).resolve(); 3204 void MacroAssembler::resolve_oop_handle(Register result) { 3205 // OopHandle::resolve is an indirection. 3206 ld(result, 0, result); 3207 } 3208 3209 void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) { 3210 ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method); 3211 ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror); 3212 ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror); 3213 resolve_oop_handle(mirror); 3214 } 3215 3216 void MacroAssembler::load_method_holder(Register holder, Register method) { 3217 ld(holder, in_bytes(Method::const_offset()), method); 3218 ld(holder, in_bytes(ConstMethod::constants_offset()), holder); 3219 ld(holder, ConstantPool::pool_holder_offset_in_bytes(), holder); 3220 } 3221 3222 // Clear Array 3223 // For very short arrays. tmp == R0 is allowed. 3224 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) { 3225 if (cnt_dwords > 0) { li(tmp, 0); } 3226 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); } 3227 } 3228 3229 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed. 
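// Illustrative equivalent of the code generated below (sketch only):
//   for (int i = 0; i < cnt_dwords; i++) { ((julong*)base_ptr)[i] = 0; }
// Counts below 8 dwords are fully unrolled; larger counts use a 2x unrolled loop
// (16 bytes per iteration) plus an optional single-dword remainder store.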
3230 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) { 3231 if (cnt_dwords < 8) { 3232 clear_memory_unrolled(base_ptr, cnt_dwords, tmp); 3233 return; 3234 } 3235 3236 Label loop; 3237 const long loopcnt = cnt_dwords >> 1, 3238 remainder = cnt_dwords & 1; 3239 3240 li(tmp, loopcnt); 3241 mtctr(tmp); 3242 li(tmp, 0); 3243 bind(loop); 3244 std(tmp, 0, base_ptr); 3245 std(tmp, 8, base_ptr); 3246 addi(base_ptr, base_ptr, 16); 3247 bdnz(loop); 3248 if (remainder) { std(tmp, 0, base_ptr); } 3249 } 3250 3251 // Kills both input registers. tmp == R0 is allowed. 3252 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) { 3253 // Procedure for large arrays (uses data cache block zero instruction). 3254 Label startloop, fast, fastloop, small_rest, restloop, done; 3255 const int cl_size = VM_Version::L1_data_cache_line_size(), 3256 cl_dwords = cl_size >> 3, 3257 cl_dw_addr_bits = exact_log2(cl_dwords), 3258 dcbz_min = 1, // Min count of dcbz executions, needs to be >0. 3259 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1; 3260 3261 if (const_cnt >= 0) { 3262 // Constant case. 3263 if (const_cnt < min_cnt) { 3264 clear_memory_constlen(base_ptr, const_cnt, tmp); 3265 return; 3266 } 3267 load_const_optimized(cnt_dwords, const_cnt, tmp); 3268 } else { 3269 // cnt_dwords already loaded in register. Need to check size. 3270 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included). 3271 blt(CCR1, small_rest); 3272 } 3273 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3274 beq(CCR0, fast); // Already 128byte aligned. 3275 3276 subfic(tmp, tmp, cl_dwords); 3277 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3278 subf(cnt_dwords, tmp, cnt_dwords); // rest. 3279 li(tmp, 0); 3280 3281 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3282 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3283 addi(base_ptr, base_ptr, 8); 3284 bdnz(startloop); 3285 3286 bind(fast); // Clear 128byte blocks. 3287 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3288 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 3289 mtctr(tmp); // Load counter. 3290 3291 bind(fastloop); 3292 dcbz(base_ptr); // Clear 128byte aligned block. 3293 addi(base_ptr, base_ptr, cl_size); 3294 bdnz(fastloop); 3295 3296 bind(small_rest); 3297 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3298 beq(CCR0, done); // rest == 0 3299 li(tmp, 0); 3300 mtctr(cnt_dwords); // Load counter. 3301 3302 bind(restloop); // Clear rest. 3303 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3304 addi(base_ptr, base_ptr, 8); 3305 bdnz(restloop); 3306 3307 bind(done); 3308 } 3309 3310 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3311 3312 #ifdef COMPILER2 3313 // Intrinsics for CompactStrings 3314 3315 // Compress char[] to byte[] by compressing 16 bytes at once. 
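// Rough sketch (illustration only) of the fast path generated below:
//   while (at least 8 chars remain) {        // ctr = cnt >> 3 iterations
//     load 8 jchars (16 bytes);
//     if (any char has a non-zero high byte) goto Lfailure;   // not latin1
//     store the 8 low bytes (two 4-byte stores);
//   }
// The remaining (< 8) chars are expected to be handled by the caller, e.g. via the
// byte-wise string_compress() below.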
3316 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt, 3317 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, 3318 Label& Lfailure) { 3319 3320 const Register tmp0 = R0; 3321 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3322 Label Lloop, Lslow; 3323 3324 // Check if cnt >= 8 (= 16 bytes) 3325 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF00FF00FF 3326 srwi_(tmp2, cnt, 3); 3327 beq(CCR0, Lslow); 3328 ori(tmp1, tmp1, 0xFF); 3329 rldimi(tmp1, tmp1, 32, 0); 3330 mtctr(tmp2); 3331 3332 // 2x unrolled loop 3333 bind(Lloop); 3334 ld(tmp2, 0, src); // _0_1_2_3 (Big Endian) 3335 ld(tmp4, 8, src); // _4_5_6_7 3336 3337 orr(tmp0, tmp2, tmp4); 3338 rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2 3339 rldimi(tmp2, tmp2, 2*8, 2*8); // _0_2_3_3 3340 rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6 3341 rldimi(tmp4, tmp4, 2*8, 2*8); // _4_6_7_7 3342 3343 andc_(tmp0, tmp0, tmp1); 3344 bne(CCR0, Lfailure); // Not latin1. 3345 addi(src, src, 16); 3346 3347 rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3 3348 srdi(tmp2, tmp2, 3*8); // ____0_2_ 3349 rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7 3350 srdi(tmp4, tmp4, 3*8); // ____4_6_ 3351 3352 orr(tmp2, tmp2, tmp3); // ____0123 3353 orr(tmp4, tmp4, tmp5); // ____4567 3354 3355 stw(tmp2, 0, dst); 3356 stw(tmp4, 4, dst); 3357 addi(dst, dst, 8); 3358 bdnz(Lloop); 3359 3360 bind(Lslow); // Fallback to slow version 3361 } 3362 3363 // Compress char[] to byte[]. cnt must be positive int. 3364 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) { 3365 Label Lloop; 3366 mtctr(cnt); 3367 3368 bind(Lloop); 3369 lhz(tmp, 0, src); 3370 cmplwi(CCR0, tmp, 0xff); 3371 bgt(CCR0, Lfailure); // Not latin1. 3372 addi(src, src, 2); 3373 stb(tmp, 0, dst); 3374 addi(dst, dst, 1); 3375 bdnz(Lloop); 3376 } 3377 3378 // Inflate byte[] to char[] by inflating 16 bytes at once. 3379 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt, 3380 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) { 3381 const Register tmp0 = R0; 3382 assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); 3383 Label Lloop, Lslow; 3384 3385 // Check if cnt >= 8 3386 srwi_(tmp2, cnt, 3); 3387 beq(CCR0, Lslow); 3388 lis(tmp1, 0xFF); // tmp1 = 0x00FF00FF 3389 ori(tmp1, tmp1, 0xFF); 3390 mtctr(tmp2); 3391 3392 // 2x unrolled loop 3393 bind(Lloop); 3394 lwz(tmp2, 0, src); // ____0123 (Big Endian) 3395 lwz(tmp4, 4, src); // ____4567 3396 addi(src, src, 8); 3397 3398 rldicl(tmp3, tmp2, 7*8, 64-8); // _______2 3399 rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113 3400 rldicl(tmp5, tmp4, 7*8, 64-8); // _______6 3401 rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557 3402 3403 andc(tmp0, tmp2, tmp1); // ____0_1_ 3404 rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3 3405 andc(tmp3, tmp4, tmp1); // ____4_5_ 3406 rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7 3407 3408 rldimi(tmp2, tmp0, 3*8, 0*8); // _0_1_2_3 3409 rldimi(tmp4, tmp3, 3*8, 0*8); // _4_5_6_7 3410 3411 std(tmp2, 0, dst); 3412 std(tmp4, 8, dst); 3413 addi(dst, dst, 16); 3414 bdnz(Lloop); 3415 3416 bind(Lslow); // Fallback to slow version 3417 } 3418 3419 // Inflate byte[] to char[]. cnt must be positive int. 
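// Illustrative equivalent of the loop generated below (sketch only):
//   for (int i = 0; i < cnt; i++) { dst[i] = (jchar)(src[i] & 0xff); }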
3420 void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) { 3421 Label Lloop; 3422 mtctr(cnt); 3423 3424 bind(Lloop); 3425 lbz(tmp, 0, src); 3426 addi(src, src, 1); 3427 sth(tmp, 0, dst); 3428 addi(dst, dst, 2); 3429 bdnz(Lloop); 3430 } 3431 3432 void MacroAssembler::string_compare(Register str1, Register str2, 3433 Register cnt1, Register cnt2, 3434 Register tmp1, Register result, int ae) { 3435 const Register tmp0 = R0, 3436 diff = tmp1; 3437 3438 assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result); 3439 Label Ldone, Lslow, Lloop, Lreturn_diff; 3440 3441 // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a) 3442 // we interchange str1 and str2 in the UL case and negate the result. 3443 // Like this, str1 is always latin1 encoded, except for the UU case. 3444 // In addition, we need 0 (or sign which is 0) extend. 3445 3446 if (ae == StrIntrinsicNode::UU) { 3447 srwi(cnt1, cnt1, 1); 3448 } else { 3449 clrldi(cnt1, cnt1, 32); 3450 } 3451 3452 if (ae != StrIntrinsicNode::LL) { 3453 srwi(cnt2, cnt2, 1); 3454 } else { 3455 clrldi(cnt2, cnt2, 32); 3456 } 3457 3458 // See if the lengths are different, and calculate min in cnt1. 3459 // Save diff in case we need it for a tie-breaker. 3460 subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2 3461 // if (diff > 0) { cnt1 = cnt2; } 3462 if (VM_Version::has_isel()) { 3463 isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2); 3464 } else { 3465 Label Lskip; 3466 blt(CCR0, Lskip); 3467 mr(cnt1, cnt2); 3468 bind(Lskip); 3469 } 3470 3471 // Rename registers 3472 Register chr1 = result; 3473 Register chr2 = tmp0; 3474 3475 // Compare multiple characters in fast loop (only implemented for same encoding). 3476 int stride1 = 8, stride2 = 8; 3477 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 3478 int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2; 3479 Label Lfastloop, Lskipfast; 3480 3481 srwi_(tmp0, cnt1, log2_chars_per_iter); 3482 beq(CCR0, Lskipfast); 3483 rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters. 3484 li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration. 3485 mtctr(tmp0); 3486 3487 bind(Lfastloop); 3488 ld(chr1, 0, str1); 3489 ld(chr2, 0, str2); 3490 cmpd(CCR0, chr1, chr2); 3491 bne(CCR0, Lslow); 3492 addi(str1, str1, stride1); 3493 addi(str2, str2, stride2); 3494 bdnz(Lfastloop); 3495 mr(cnt1, cnt2); // Remaining characters. 3496 bind(Lskipfast); 3497 } 3498 3499 // Loop which searches the first difference character by character. 3500 cmpwi(CCR0, cnt1, 0); 3501 beq(CCR0, Lreturn_diff); 3502 bind(Lslow); 3503 mtctr(cnt1); 3504 3505 switch (ae) { 3506 case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break; 3507 case StrIntrinsicNode::UL: // fallthru (see comment above) 3508 case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break; 3509 case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break; 3510 default: ShouldNotReachHere(); break; 3511 } 3512 3513 bind(Lloop); 3514 if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); } 3515 if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); } 3516 subf_(result, chr2, chr1); // result = chr1 - chr2 3517 bne(CCR0, Ldone); 3518 addi(str1, str1, stride1); 3519 addi(str2, str2, stride2); 3520 bdnz(Lloop); 3521 3522 // If strings are equal up to min length, return the length difference. 
3523 bind(Lreturn_diff); 3524 mr(result, diff); 3525 3526 // Otherwise, return the difference between the first mismatched chars. 3527 bind(Ldone); 3528 if (ae == StrIntrinsicNode::UL) { 3529 neg(result, result); // Negate result (see note above). 3530 } 3531 } 3532 3533 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2, 3534 Register limit, Register tmp1, Register result, bool is_byte) { 3535 const Register tmp0 = R0; 3536 assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result); 3537 Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast; 3538 bool limit_needs_shift = false; 3539 3540 if (is_array_equ) { 3541 const int length_offset = arrayOopDesc::length_offset_in_bytes(); 3542 const int base_offset = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR); 3543 3544 // Return true if the same array. 3545 cmpd(CCR0, ary1, ary2); 3546 beq(CCR0, Lskiploop); 3547 3548 // Return false if one of them is NULL. 3549 cmpdi(CCR0, ary1, 0); 3550 cmpdi(CCR1, ary2, 0); 3551 li(result, 0); 3552 cror(CCR0, Assembler::equal, CCR1, Assembler::equal); 3553 beq(CCR0, Ldone); 3554 3555 // Load the lengths of arrays. 3556 lwz(limit, length_offset, ary1); 3557 lwz(tmp0, length_offset, ary2); 3558 3559 // Return false if the two arrays are not equal length. 3560 cmpw(CCR0, limit, tmp0); 3561 bne(CCR0, Ldone); 3562 3563 // Load array addresses. 3564 addi(ary1, ary1, base_offset); 3565 addi(ary2, ary2, base_offset); 3566 } else { 3567 limit_needs_shift = !is_byte; 3568 li(result, 0); // Assume not equal. 3569 } 3570 3571 // Rename registers 3572 Register chr1 = tmp0; 3573 Register chr2 = tmp1; 3574 3575 // Compare 8 bytes per iteration in fast loop. 3576 const int log2_chars_per_iter = is_byte ? 3 : 2; 3577 3578 srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0)); 3579 beq(CCR0, Lskipfast); 3580 mtctr(tmp0); 3581 3582 bind(Lfastloop); 3583 ld(chr1, 0, ary1); 3584 ld(chr2, 0, ary2); 3585 addi(ary1, ary1, 8); 3586 addi(ary2, ary2, 8); 3587 cmpd(CCR0, chr1, chr2); 3588 bne(CCR0, Ldone); 3589 bdnz(Lfastloop); 3590 3591 bind(Lskipfast); 3592 rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters. 3593 beq(CCR0, Lskiploop); 3594 mtctr(limit); 3595 3596 // Character by character. 3597 bind(Lloop); 3598 if (is_byte) { 3599 lbz(chr1, 0, ary1); 3600 lbz(chr2, 0, ary2); 3601 addi(ary1, ary1, 1); 3602 addi(ary2, ary2, 1); 3603 } else { 3604 lhz(chr1, 0, ary1); 3605 lhz(chr2, 0, ary2); 3606 addi(ary1, ary1, 2); 3607 addi(ary2, ary2, 2); 3608 } 3609 cmpw(CCR0, chr1, chr2); 3610 bne(CCR0, Ldone); 3611 bdnz(Lloop); 3612 3613 bind(Lskiploop); 3614 li(result, 1); // All characters are equal. 3615 bind(Ldone); 3616 } 3617 3618 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt, 3619 Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval, 3620 Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) { 3621 3622 // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite! 3623 Label L_TooShort, L_Found, L_NotFound, L_End; 3624 Register last_addr = haycnt, // Kill haycnt at the beginning. 3625 addr = tmp1, 3626 n_start = tmp2, 3627 ch1 = tmp3, 3628 ch2 = R0; 3629 3630 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 3631 const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2; 3632 const int n_csize = (ae == StrIntrinsicNode::UU) ? 
2 : 1; 3633 3634 // ************************************************************************************************** 3635 // Prepare for main loop: optimized for needle count >=2, bail out otherwise. 3636 // ************************************************************************************************** 3637 3638 // Compute last haystack addr to use if no match gets found. 3639 clrldi(haycnt, haycnt, 32); // Ensure positive int is valid as 64 bit value. 3640 addi(addr, haystack, -h_csize); // Accesses use pre-increment. 3641 if (needlecntval == 0) { // variable needlecnt 3642 cmpwi(CCR6, needlecnt, 2); 3643 clrldi(needlecnt, needlecnt, 32); // Ensure positive int is valid as 64 bit value. 3644 blt(CCR6, L_TooShort); // Variable needlecnt: handle short needle separately. 3645 } 3646 3647 if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle. 3648 3649 if (needlecntval == 0) { // variable needlecnt 3650 subf(ch1, needlecnt, haycnt); // Last character index to compare is haycnt-needlecnt. 3651 addi(needlecnt, needlecnt, -2); // Rest of needle. 3652 } else { // constant needlecnt 3653 guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately"); 3654 assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate"); 3655 addi(ch1, haycnt, -needlecntval); // Last character index to compare is haycnt-needlecnt. 3656 if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle. 3657 } 3658 3659 if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes. 3660 3661 if (ae ==StrIntrinsicNode::UL) { 3662 srwi(tmp4, n_start, 1*8); // ___0 3663 rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1 3664 } 3665 3666 add(last_addr, haystack, ch1); // Point to last address to compare (haystack+2*(haycnt-needlecnt)). 3667 3668 // Main Loop (now we have at least 2 characters). 3669 Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2; 3670 bind(L_OuterLoop); // Search for 1st 2 characters. 3671 Register addr_diff = tmp4; 3672 subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check. 3673 addi(addr, addr, h_csize); // This is the new address we want to use for comparing. 3674 srdi_(ch2, addr_diff, h_csize); 3675 beq(CCR0, L_FinalCheck); // 2 characters left? 3676 mtctr(ch2); // num of characters / 2 3677 bind(L_InnerLoop); // Main work horse (2x unrolled search loop) 3678 if (h_csize == 2) { // Load 2 characters of haystack (ignore alignment). 3679 lwz(ch1, 0, addr); 3680 lwz(ch2, 2, addr); 3681 } else { 3682 lhz(ch1, 0, addr); 3683 lhz(ch2, 1, addr); 3684 } 3685 cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop). 3686 cmpw(CCR1, ch2, n_start); 3687 beq(CCR0, L_Comp1); // Did we find the needle start? 3688 beq(CCR1, L_Comp2); 3689 addi(addr, addr, 2 * h_csize); 3690 bdnz(L_InnerLoop); 3691 bind(L_FinalCheck); 3692 andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1. 3693 beq(CCR0, L_NotFound); 3694 if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare. 
3695 cmpw(CCR1, ch1, n_start); 3696 beq(CCR1, L_Comp1); 3697 bind(L_NotFound); 3698 li(result, -1); // not found 3699 b(L_End); 3700 3701 // ************************************************************************************************** 3702 // Special Case: unfortunately, the variable needle case can be called with needlecnt<2 3703 // ************************************************************************************************** 3704 if (needlecntval == 0) { // We have to handle these cases separately. 3705 Label L_OneCharLoop; 3706 bind(L_TooShort); 3707 mtctr(haycnt); 3708 if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle 3709 bind(L_OneCharLoop); 3710 if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); } 3711 cmpw(CCR1, ch1, n_start); 3712 beq(CCR1, L_Found); // Did we find the one character needle? 3713 bdnz(L_OneCharLoop); 3714 li(result, -1); // Not found. 3715 b(L_End); 3716 } 3717 3718 // ************************************************************************************************** 3719 // Regular Case Part II: compare rest of needle (first 2 characters have been compared already) 3720 // ************************************************************************************************** 3721 3722 // Compare the rest 3723 bind(L_Comp2); 3724 addi(addr, addr, h_csize); // First comparison has failed, 2nd one hit. 3725 bind(L_Comp1); // Addr points to possible needle start. 3726 if (needlecntval != 2) { // Const needlecnt==2? 3727 if (needlecntval != 3) { 3728 if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2? 3729 Register n_ind = tmp4, 3730 h_ind = n_ind; 3731 li(n_ind, 2 * n_csize); // First 2 characters are already compared, use index 2. 3732 mtctr(needlecnt); // Decremented by 2, still > 0. 3733 Label L_CompLoop; 3734 bind(L_CompLoop); 3735 if (ae ==StrIntrinsicNode::UL) { 3736 h_ind = ch1; 3737 sldi(h_ind, n_ind, 1); 3738 } 3739 if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); } 3740 if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); } 3741 cmpw(CCR1, ch1, ch2); 3742 bne(CCR1, L_OuterLoop); 3743 addi(n_ind, n_ind, n_csize); 3744 bdnz(L_CompLoop); 3745 } else { // No loop required if there's only one needle character left. 3746 if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); } 3747 if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); } 3748 cmpw(CCR1, ch1, ch2); 3749 bne(CCR1, L_OuterLoop); 3750 } 3751 } 3752 // Return index ... 3753 bind(L_Found); 3754 subf(result, haystack, addr); // relative to haystack, ... 3755 if (h_csize == 2) { srdi(result, result, 1); } // in characters. 3756 bind(L_End); 3757 } // string_indexof 3758 3759 void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt, 3760 Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) { 3761 assert_different_registers(haystack, haycnt, needle, tmp1, tmp2); 3762 3763 Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End; 3764 Register addr = tmp1, 3765 ch1 = tmp2, 3766 ch2 = R0; 3767 3768 const int h_csize = is_byte ? 1 : 2; 3769 3770 //4: 3771 srwi_(tmp2, haycnt, 1); // Shift right by exact_log2(UNROLL_FACTOR). 3772 mr(addr, haystack); 3773 beq(CCR0, L_FinalCheck); 3774 mtctr(tmp2); // Move to count register. 3775 //8: 3776 bind(L_InnerLoop); // Main work horse (2x unrolled search loop). 
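// Each iteration below inspects two consecutive haystack positions and compares
// both against the single needle character; a match exits via L_Found1/L_Found2.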
3777 if (!is_byte) { 3778 lhz(ch1, 0, addr); 3779 lhz(ch2, 2, addr); 3780 } else { 3781 lbz(ch1, 0, addr); 3782 lbz(ch2, 1, addr); 3783 } 3784 (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar); 3785 (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar); 3786 beq(CCR0, L_Found1); // Did we find the needle? 3787 beq(CCR1, L_Found2); 3788 addi(addr, addr, 2 * h_csize); 3789 bdnz(L_InnerLoop); 3790 //16: 3791 bind(L_FinalCheck); 3792 andi_(R0, haycnt, 1); 3793 beq(CCR0, L_NotFound); 3794 if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare. 3795 (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar); 3796 beq(CCR1, L_Found1); 3797 //21: 3798 bind(L_NotFound); 3799 li(result, -1); // Not found. 3800 b(L_End); 3801 3802 bind(L_Found2); 3803 addi(addr, addr, h_csize); 3804 //24: 3805 bind(L_Found1); // Return index ... 3806 subf(result, haystack, addr); // relative to haystack, ... 3807 if (!is_byte) { srdi(result, result, 1); } // in characters. 3808 bind(L_End); 3809 } // string_indexof_char 3810 3811 3812 void MacroAssembler::has_negatives(Register src, Register cnt, Register result, 3813 Register tmp1, Register tmp2) { 3814 const Register tmp0 = R0; 3815 assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2); 3816 Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone; 3817 3818 // Check if cnt >= 8 (= 16 bytes) 3819 lis(tmp1, (int)(short)0x8080); // tmp1 = 0x8080808080808080 3820 srwi_(tmp2, cnt, 4); 3821 li(result, 1); // Assume there's a negative byte. 3822 beq(CCR0, Lslow); 3823 ori(tmp1, tmp1, 0x8080); 3824 rldimi(tmp1, tmp1, 32, 0); 3825 mtctr(tmp2); 3826 3827 // 2x unrolled loop 3828 bind(Lfastloop); 3829 ld(tmp2, 0, src); 3830 ld(tmp0, 8, src); 3831 3832 orr(tmp0, tmp2, tmp0); 3833 3834 and_(tmp0, tmp0, tmp1); 3835 bne(CCR0, Ldone); // Found negative byte. 3836 addi(src, src, 16); 3837 3838 bdnz(Lfastloop); 3839 3840 bind(Lslow); // Fallback to slow version 3841 rldicl_(tmp0, cnt, 0, 64-4); 3842 beq(CCR0, Lnoneg); 3843 mtctr(tmp0); 3844 bind(Lloop); 3845 lbz(tmp0, 0, src); 3846 addi(src, src, 1); 3847 andi_(tmp0, tmp0, 0x80); 3848 bne(CCR0, Ldone); // Found negative byte. 3849 bdnz(Lloop); 3850 bind(Lnoneg); 3851 li(result, 0); 3852 3853 bind(Ldone); 3854 } 3855 3856 #endif // Compiler2 3857 3858 // Helpers for Intrinsic Emitters 3859 // 3860 // Revert the byte order of a 32bit value in a register 3861 // src: 0x44556677 3862 // dst: 0x77665544 3863 // Three steps to obtain the result: 3864 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word 3865 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 3866 // This value initializes dst. 3867 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 3868 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 3869 // This value is mask inserted into dst with a [0..23] mask of 1s. 3870 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 3871 // This value is mask inserted into dst with a [8..15] mask of 1s. 3872 void MacroAssembler::load_reverse_32(Register dst, Register src) { 3873 assert_different_registers(dst, src); 3874 3875 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 
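// For the example above (src = 0x44556677): dst is now 0x0000000000000044; the two
// rlwimi steps below turn the low word into 0x77445544 and finally 0x77665544.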
3876 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone. 3877 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone. 3878 } 3879 3880 // Calculate the column addresses of the crc32 lookup table into distinct registers. 3881 // This loop-invariant calculation is moved out of the loop body, reducing the loop 3882 // body size from 20 to 16 instructions. 3883 // Returns the offset that was used to calculate the address of column tc3. 3884 // Due to register shortage, setting tc3 may overwrite table. With the return offset 3885 // at hand, the original table address can be easily reconstructed. 3886 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 3887 assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!"); 3888 3889 // Point to 4 byte folding tables (byte-reversed version for Big Endian) 3890 // Layout: See StubRoutines::generate_crc_constants. 3891 #ifdef VM_LITTLE_ENDIAN 3892 const int ix0 = 3 * CRC32_TABLE_SIZE; 3893 const int ix1 = 2 * CRC32_TABLE_SIZE; 3894 const int ix2 = 1 * CRC32_TABLE_SIZE; 3895 const int ix3 = 0 * CRC32_TABLE_SIZE; 3896 #else 3897 const int ix0 = 1 * CRC32_TABLE_SIZE; 3898 const int ix1 = 2 * CRC32_TABLE_SIZE; 3899 const int ix2 = 3 * CRC32_TABLE_SIZE; 3900 const int ix3 = 4 * CRC32_TABLE_SIZE; 3901 #endif 3902 assert_different_registers(table, tc0, tc1, tc2); 3903 assert(table == tc3, "must be!"); 3904 3905 addi(tc0, table, ix0); 3906 addi(tc1, table, ix1); 3907 addi(tc2, table, ix2); 3908 if (ix3 != 0) addi(tc3, table, ix3); 3909 3910 return ix3; 3911 } 3912 3913 /** 3914 * uint32_t crc; 3915 * table[crc & 0xFF] ^ (crc >> 8); 3916 */ 3917 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 3918 assert_different_registers(crc, table, tmp); 3919 assert_different_registers(val, table); 3920 3921 if (crc == val) { // Must rotate first to use the unmodified value. 3922 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3923 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 3924 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3925 } else { 3926 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3927 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3928 } 3929 lwzx(tmp, table, tmp); 3930 xorr(crc, crc, tmp); 3931 } 3932 3933 /** 3934 * Emits code to update CRC-32 with a byte value according to constants in table. 3935 * 3936 * @param [in,out]crc Register containing the crc. 3937 * @param [in]val Register containing the byte to fold into the CRC. 3938 * @param [in]table Register containing the table of crc constants. 
3939 * 3940 * uint32_t crc; 3941 * val = crc_table[(val ^ crc) & 0xFF]; 3942 * crc = val ^ (crc >> 8); 3943 */ 3944 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3945 BLOCK_COMMENT("update_byte_crc32:"); 3946 xorr(val, val, crc); 3947 fold_byte_crc32(crc, val, table, val); 3948 } 3949 3950 /** 3951 * @param crc register containing existing CRC (32-bit) 3952 * @param buf register pointing to input byte buffer (byte*) 3953 * @param len register containing number of bytes 3954 * @param table register pointing to CRC table 3955 */ 3956 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table, 3957 Register data, bool loopAlignment) { 3958 assert_different_registers(crc, buf, len, table, data); 3959 3960 Label L_mainLoop, L_done; 3961 const int mainLoop_stepping = 1; 3962 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4; 3963 3964 // Process all bytes in a single-byte loop. 3965 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do? 3966 beq(CCR0, L_done); 3967 3968 mtctr(len); 3969 align(mainLoop_alignment); 3970 BIND(L_mainLoop); 3971 lbz(data, 0, buf); // Byte from buffer, zero-extended. 3972 addi(buf, buf, mainLoop_stepping); // Advance buffer position. 3973 update_byte_crc32(crc, data, table); 3974 bdnz(L_mainLoop); // Iterate. 3975 3976 bind(L_done); 3977 } 3978 3979 /** 3980 * Emits code to update CRC-32 with a 4-byte value according to constants in table 3981 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c 3982 */ 3983 // A note on the lookup table address(es): 3984 // The implementation uses 4 table columns (byte-reversed versions for Big Endian). 3985 // To save the effort of adding the column offset to the table address each time 3986 // a table element is looked up, it is possible to pass the pre-calculated 3987 // column addresses. 3988 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary. 3989 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc, 3990 Register t0, Register t1, Register t2, Register t3, 3991 Register tc0, Register tc1, Register tc2, Register tc3) { 3992 assert_different_registers(crc, t3); 3993 3994 // XOR crc with next four bytes of buffer. 3995 lwz(t3, bufDisp, buf); 3996 if (bufInc != 0) { 3997 addi(buf, buf, bufInc); 3998 } 3999 xorr(t3, t3, crc); 4000 4001 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices. 4002 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 2 4003 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 2 4004 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2 4005 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2 4006 4007 // Use the pre-calculated column addresses. 4008 // Load pre-calculated table values. 4009 lwzx(t0, tc0, t0); 4010 lwzx(t1, tc1, t1); 4011 lwzx(t2, tc2, t2); 4012 lwzx(t3, tc3, t3); 4013 4014 // Calculate new crc from table values. 4015 xorr(t0, t0, t1); 4016 xorr(t2, t2, t3); 4017 xorr(crc, t0, t2); // Now crc contains the final checksum value. 4018 } 4019 4020 /** 4021 * @param crc register containing existing CRC (32-bit) 4022 * @param buf register pointing to input byte buffer (byte*) 4023 * @param len register containing number of bytes 4024 * @param table register pointing to CRC table 4025 * 4026 * uses R9..R12 as work register. Must be saved/restored by caller! 
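 *
 * Rough flow of the generated code (sketch): a byte-wise pre-loop aligns buf to a
 * 4-byte boundary, the main loop then folds 4 bytes per iteration using four table
 * lookups, and a byte-wise tail loop handles the remaining few bytes.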
4027 */
4028 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4029 Register t0, Register t1, Register t2, Register t3,
4030 Register tc0, Register tc1, Register tc2, Register tc3,
4031 bool invertCRC) {
4032 assert_different_registers(crc, buf, len, table);
4033
4034 Label L_mainLoop, L_tail;
4035 Register tmp = t0;
4036 Register data = t0;
4037 Register tmp2 = t1;
4038 const int mainLoop_stepping = 4;
4039 const int tailLoop_stepping = 1;
4040 const int log_stepping = exact_log2(mainLoop_stepping);
4041 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4042 const int complexThreshold = 2*mainLoop_stepping;
4043
4044 // Don't test for len <= 0 here. This pathological case should not occur anyway.
4045 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4046 // for all well-behaved cases. The situation itself is detected and handled correctly
4047 // within update_byteLoop_crc32.
4048 assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4049
4050 BLOCK_COMMENT("kernel_crc32_1word {");
4051
4052 if (invertCRC) {
4053 nand(crc, crc, crc); // 1s complement of crc
4054 }
4055
4056 // Check for short (<complexThreshold) buffer.
4057 cmpdi(CCR0, len, complexThreshold);
4058 blt(CCR0, L_tail);
4059
4060 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4061 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4062 {
4063 // Align buf addr to mainLoop_stepping boundary.
4064 neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
4065 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Keep only the low log_stepping bits of tmp2 (mask with 1s in bits 62..63).
4066
4067 if (complexThreshold > mainLoop_stepping) {
4068 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4069 } else {
4070 sub(tmp, len, tmp2); // Remaining bytes for main loop.
4071 cmpdi(CCR0, tmp, mainLoop_stepping);
4072 blt(CCR0, L_tail); // For less than one mainLoop_stepping left, do only tail processing.
4073 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4074 }
4075 update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4076 }
4077
4078 srdi(tmp2, len, log_stepping); // #iterations for mainLoop
4079 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
4080 mtctr(tmp2);
4081
4082 #ifdef VM_LITTLE_ENDIAN
4083 Register crc_rv = crc;
4084 #else
4085 Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
4086 // Occupies tmp, but frees up crc.
4087 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data.
4088 tmp = crc;
4089 #endif
4090
4091 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4092
4093 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
4094 BIND(L_mainLoop);
4095 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4096 bdnz(L_mainLoop);
4097
4098 #ifndef VM_LITTLE_ENDIAN
4099 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.
4100 tmp = crc_rv; // Tmp uses its original register again.
4101 #endif
4102
4103 // Restore original table address for tailLoop.
4104 if (reconstructTableOffset != 0) {
4105 addi(table, table, -reconstructTableOffset);
4106 }
4107
4108 // Process last few (<complexThreshold) bytes of buffer.
4109 BIND(L_tail); 4110 update_byteLoop_crc32(crc, buf, len, table, data, false); 4111 4112 if (invertCRC) { 4113 nand(crc, crc, crc); // 1s complement of crc 4114 } 4115 BLOCK_COMMENT("} kernel_crc32_1word"); 4116 } 4117 4118 /** 4119 * @param crc register containing existing CRC (32-bit) 4120 * @param buf register pointing to input byte buffer (byte*) 4121 * @param len register containing number of bytes 4122 * @param constants register pointing to precomputed constants 4123 * @param t0-t6 temp registers 4124 */ 4125 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants, 4126 Register t0, Register t1, Register t2, Register t3, 4127 Register t4, Register t5, Register t6, bool invertCRC) { 4128 assert_different_registers(crc, buf, len, constants); 4129 4130 Label L_tail; 4131 4132 BLOCK_COMMENT("kernel_crc32_vpmsum {"); 4133 4134 if (invertCRC) { 4135 nand(crc, crc, crc); // 1s complement of crc 4136 } 4137 4138 // Enforce 32 bit. 4139 clrldi(len, len, 32); 4140 4141 // Align if we have enough bytes for the fast version. 4142 const int alignment = 16, 4143 threshold = 32; 4144 Register prealign = t0; 4145 4146 neg(prealign, buf); 4147 addi(t1, len, -threshold); 4148 andi(prealign, prealign, alignment - 1); 4149 cmpw(CCR0, t1, prealign); 4150 blt(CCR0, L_tail); // len - prealign < threshold? 4151 4152 subf(len, prealign, len); 4153 update_byteLoop_crc32(crc, buf, prealign, constants, t2, false); 4154 4155 // Calculate from first aligned address as far as possible. 4156 addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants. 4157 kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6); 4158 addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again. 4159 4160 // Remaining bytes. 4161 BIND(L_tail); 4162 update_byteLoop_crc32(crc, buf, len, constants, t2, false); 4163 4164 if (invertCRC) { 4165 nand(crc, crc, crc); // 1s complement of crc 4166 } 4167 4168 BLOCK_COMMENT("} kernel_crc32_vpmsum"); 4169 } 4170 4171 /** 4172 * @param crc register containing existing CRC (32-bit) 4173 * @param buf register pointing to input byte buffer (byte*) 4174 * @param len register containing number of bytes (will get updated to remaining bytes) 4175 * @param constants register pointing to CRC table for 128-bit aligned memory 4176 * @param t0-t6 temp registers 4177 */ 4178 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants, 4179 Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) { 4180 4181 // Save non-volatile vector registers (frameless). 4182 Register offset = t1; 4183 int offsetInt = 0; 4184 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP); 4185 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP); 4186 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP); 4187 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP); 4188 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP); 4189 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP); 4190 #ifndef VM_LITTLE_ENDIAN 4191 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP); 4192 #endif 4193 offsetInt -= 8; std(R14, offsetInt, R1_SP); 4194 offsetInt -= 8; std(R15, offsetInt, R1_SP); 4195 4196 // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor 4197 // bytes per iteration. 
The basic scheme is: 4198 // lvx: load vector (Big Endian needs reversal) 4199 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift 4200 // vxor: xor partial results together to get unroll_factor2 vectors 4201 4202 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors. 4203 4204 // Using 16 * unroll_factor / unroll_factor_2 bytes for constants. 4205 const int unroll_factor = CRC32_UNROLL_FACTOR, 4206 unroll_factor2 = CRC32_UNROLL_FACTOR2; 4207 4208 const int outer_consts_size = (unroll_factor2 - 1) * 16, 4209 inner_consts_size = (unroll_factor / unroll_factor2) * 16; 4210 4211 // Support registers. 4212 Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 }; 4213 Register num_bytes = R14, 4214 loop_count = R15, 4215 cur_const = crc; // will live in VCRC 4216 // Constant array for outer loop: unroll_factor2 - 1 registers, 4217 // Constant array for inner loop: unroll_factor / unroll_factor2 registers. 4218 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 }, 4219 consts1[] = { VR23, VR24 }; 4220 // Data register arrays: 2 arrays with unroll_factor2 registers. 4221 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 }, 4222 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 }; 4223 4224 VectorRegister VCRC = data0[0]; 4225 VectorRegister Vc = VR25; 4226 VectorRegister swap_bytes = VR26; // Only for Big Endian. 4227 4228 // We have at least 1 iteration (ensured by caller). 4229 Label L_outer_loop, L_inner_loop, L_last; 4230 4231 // If supported set DSCR pre-fetch to deepest. 4232 if (VM_Version::has_mfdscr()) { 4233 load_const_optimized(t0, VM_Version::_dscr_val | 7); 4234 mtdscr(t0); 4235 } 4236 4237 mtvrwz(VCRC, crc); // crc lives in VCRC, now 4238 4239 for (int i = 1; i < unroll_factor2; ++i) { 4240 li(offs[i], 16 * i); 4241 } 4242 4243 // Load consts for outer loop 4244 lvx(consts0[0], constants); 4245 for (int i = 1; i < unroll_factor2 - 1; ++i) { 4246 lvx(consts0[i], offs[i], constants); 4247 } 4248 4249 load_const_optimized(num_bytes, 16 * unroll_factor); 4250 4251 // Reuse data registers outside of the loop. 4252 VectorRegister Vtmp = data1[0]; 4253 VectorRegister Vtmp2 = data1[1]; 4254 VectorRegister zeroes = data1[2]; 4255 4256 vspltisb(Vtmp, 0); 4257 vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC. 4258 4259 // Load vector for vpermxor (to xor both 64 bit parts together) 4260 lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f 4261 vspltisb(Vc, 4); 4262 vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0 4263 xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0); 4264 vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f 4265 4266 #ifdef VM_LITTLE_ENDIAN 4267 #define BE_swap_bytes(x) 4268 #else 4269 vspltisb(Vtmp2, 0xf); 4270 vxor(swap_bytes, Vtmp, Vtmp2); 4271 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes) 4272 #endif 4273 4274 cmpd(CCR0, len, num_bytes); 4275 blt(CCR0, L_last); 4276 4277 addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop 4278 load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off. 4279 4280 // ********** Main loop start ********** 4281 align(32); 4282 bind(L_outer_loop); 4283 4284 // Begin of unrolled first iteration (no xor). 4285 lvx(data1[0], buf); 4286 for (int i = 1; i < unroll_factor2 / 2; ++i) { 4287 lvx(data1[i], offs[i], buf); 4288 } 4289 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 
4290 lvx(consts1[0], cur_const); 4291 mtctr(loop_count); 4292 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4293 BE_swap_bytes(data1[i]); 4294 if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC. 4295 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 4296 vpmsumw(data0[i], data1[i], consts1[0]); 4297 } 4298 addi(buf, buf, 16 * unroll_factor2); 4299 subf(len, num_bytes, len); 4300 lvx(consts1[1], offs[1], cur_const); 4301 addi(cur_const, cur_const, 32); 4302 // Begin of unrolled second iteration (head). 4303 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4304 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 4305 if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); } 4306 vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]); 4307 } 4308 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4309 BE_swap_bytes(data1[i]); 4310 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 4311 vpmsumw(data1[i], data1[i], consts1[1]); 4312 } 4313 addi(buf, buf, 16 * unroll_factor2); 4314 4315 // Generate most performance relevant code. Loads + half of the vpmsumw have been generated. 4316 // Double-iteration allows using the 2 constant registers alternatingly. 4317 align(32); 4318 bind(L_inner_loop); 4319 for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling. 4320 if (j & 1) { 4321 lvx(consts1[0], cur_const); 4322 } else { 4323 lvx(consts1[1], offs[1], cur_const); 4324 addi(cur_const, cur_const, 32); 4325 } 4326 for (int i = 0; i < unroll_factor2; ++i) { 4327 int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input. 4328 if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; } 4329 BE_swap_bytes(data1[idx]); 4330 vxor(data0[i], data0[i], data1[i]); 4331 if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf); 4332 vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]); 4333 } 4334 addi(buf, buf, 16 * unroll_factor2); 4335 } 4336 bdnz(L_inner_loop); 4337 4338 addi(cur_const, constants, outer_consts_size); // Reset 4339 4340 // Tail of last iteration (no loads). 4341 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4342 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 4343 vxor(data0[i], data0[i], data1[i]); 4344 vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]); 4345 } 4346 for (int i = 0; i < unroll_factor2 / 2; ++i) { 4347 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts. 4348 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]); 4349 } 4350 4351 // Last data register is ok, other ones need fixup shift. 4352 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) { 4353 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); 4354 } 4355 4356 // Combine to 128 bit result vector VCRC = data0[0]. 4357 for (int i = 1; i < unroll_factor2; i<<=1) { 4358 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) { 4359 vxor(data0[j], data0[j], data0[j+i]); 4360 } 4361 } 4362 cmpd(CCR0, len, num_bytes); 4363 bge(CCR0, L_outer_loop); 4364 4365 // Last chance with lower num_bytes. 4366 bind(L_last); 4367 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations. 4368 // Point behind last const for inner loop. 4369 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 4370 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used. 
4371 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2)); 4372 subf(cur_const, R0, cur_const); // Point to constant to be used first. 4373 4374 addic_(loop_count, loop_count, -1); // One double-iteration peeled off. 4375 bgt(CCR0, L_outer_loop); 4376 // ********** Main loop end ********** 4377 4378 // Restore DSCR pre-fetch value. 4379 if (VM_Version::has_mfdscr()) { 4380 load_const_optimized(t0, VM_Version::_dscr_val); 4381 mtdscr(t0); 4382 } 4383 4384 // ********** Simple loop for remaining 16 byte blocks ********** 4385 { 4386 Label L_loop, L_done; 4387 4388 srdi_(t0, len, 4); // 16 bytes per iteration 4389 clrldi(len, len, 64-4); 4390 beq(CCR0, L_done); 4391 4392 // Point to const (same as last const for inner loop). 4393 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16); 4394 mtctr(t0); 4395 lvx(Vtmp2, cur_const); 4396 4397 align(32); 4398 bind(L_loop); 4399 4400 lvx(Vtmp, buf); 4401 addi(buf, buf, 16); 4402 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 4403 BE_swap_bytes(Vtmp); 4404 vxor(VCRC, VCRC, Vtmp); 4405 vpmsumw(VCRC, VCRC, Vtmp2); 4406 bdnz(L_loop); 4407 4408 bind(L_done); 4409 } 4410 // ********** Simple loop end ********** 4411 #undef BE_swap_bytes 4412 4413 // Point to Barrett constants 4414 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 4415 4416 vspltisb(zeroes, 0); 4417 4418 // Combine to 64 bit result. 4419 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 4420 4421 // Reduce to 32 bit CRC: Remainder by multiply-high. 4422 lvx(Vtmp, cur_const); 4423 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit. 4424 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly. 4425 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit. 4426 vsldoi(Vtmp, zeroes, Vtmp, 8); 4427 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly. 4428 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit. 4429 4430 // Move result. len is already updated. 4431 vsldoi(VCRC, VCRC, zeroes, 8); 4432 mfvrd(crc, VCRC); 4433 4434 // Restore non-volatile Vector registers (frameless). 4435 offsetInt = 0; 4436 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP); 4437 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP); 4438 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP); 4439 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP); 4440 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP); 4441 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP); 4442 #ifndef VM_LITTLE_ENDIAN 4443 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP); 4444 #endif 4445 offsetInt -= 8; ld(R14, offsetInt, R1_SP); 4446 offsetInt -= 8; ld(R15, offsetInt, R1_SP); 4447 } 4448 4449 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2, 4450 Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) { 4451 load_const_optimized(t0, is_crc32c ? 
StubRoutines::crc32c_table_addr() 4452 : StubRoutines::crc_table_addr() , R0); 4453 4454 if (VM_Version::has_vpmsumb()) { 4455 kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c); 4456 } else { 4457 kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c); 4458 } 4459 } 4460 4461 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) { 4462 assert_different_registers(crc, val, table); 4463 4464 BLOCK_COMMENT("kernel_crc32_singleByteReg:"); 4465 if (invertCRC) { 4466 nand(crc, crc, crc); // 1s complement of crc 4467 } 4468 4469 update_byte_crc32(crc, val, table); 4470 4471 if (invertCRC) { 4472 nand(crc, crc, crc); // 1s complement of crc 4473 } 4474 } 4475 4476 // dest_lo += src1 + src2 4477 // dest_hi += carry1 + carry2 4478 void MacroAssembler::add2_with_carry(Register dest_hi, 4479 Register dest_lo, 4480 Register src1, Register src2) { 4481 li(R0, 0); 4482 addc(dest_lo, dest_lo, src1); 4483 adde(dest_hi, dest_hi, R0); 4484 addc(dest_lo, dest_lo, src2); 4485 adde(dest_hi, dest_hi, R0); 4486 } 4487 4488 // Multiply 64 bit by 64 bit first loop. 4489 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, 4490 Register x_xstart, 4491 Register y, Register y_idx, 4492 Register z, 4493 Register carry, 4494 Register product_high, Register product, 4495 Register idx, Register kdx, 4496 Register tmp) { 4497 // jlong carry, x[], y[], z[]; 4498 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 4499 // huge_128 product = y[idx] * x[xstart] + carry; 4500 // z[kdx] = (jlong)product; 4501 // carry = (jlong)(product >>> 64); 4502 // } 4503 // z[xstart] = carry; 4504 4505 Label L_first_loop, L_first_loop_exit; 4506 Label L_one_x, L_one_y, L_multiply; 4507 4508 addic_(xstart, xstart, -1); 4509 blt(CCR0, L_one_x); // Special case: length of x is 1. 4510 4511 // Load next two integers of x. 4512 sldi(tmp, xstart, LogBytesPerInt); 4513 ldx(x_xstart, x, tmp); 4514 #ifdef VM_LITTLE_ENDIAN 4515 rldicl(x_xstart, x_xstart, 32, 0); 4516 #endif 4517 4518 align(32, 16); 4519 bind(L_first_loop); 4520 4521 cmpdi(CCR0, idx, 1); 4522 blt(CCR0, L_first_loop_exit); 4523 addi(idx, idx, -2); 4524 beq(CCR0, L_one_y); 4525 4526 // Load next two integers of y. 4527 sldi(tmp, idx, LogBytesPerInt); 4528 ldx(y_idx, y, tmp); 4529 #ifdef VM_LITTLE_ENDIAN 4530 rldicl(y_idx, y_idx, 32, 0); 4531 #endif 4532 4533 4534 bind(L_multiply); 4535 multiply64(product_high, product, x_xstart, y_idx); 4536 4537 li(tmp, 0); 4538 addc(product, product, carry); // Add carry to result. 4539 adde(product_high, product_high, tmp); // Add carry of the last addition. 4540 addi(kdx, kdx, -2); 4541 4542 // Store result. 4543 #ifdef VM_LITTLE_ENDIAN 4544 rldicl(product, product, 32, 0); 4545 #endif 4546 sldi(tmp, kdx, LogBytesPerInt); 4547 stdx(product, z, tmp); 4548 mr_if_needed(carry, product_high); 4549 b(L_first_loop); 4550 4551 4552 bind(L_one_y); // Load one 32 bit portion of y as (0,value). 4553 4554 lwz(y_idx, 0, y); 4555 b(L_multiply); 4556 4557 4558 bind(L_one_x); // Load one 32 bit portion of x as (0,value). 4559 4560 lwz(x_xstart, 0, x); 4561 b(L_first_loop); 4562 4563 bind(L_first_loop_exit); 4564 } 4565 4566 // Multiply 64 bit by 64 bit and add 128 bit. 
4567 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, 4568 Register z, Register yz_idx, 4569 Register idx, Register carry, 4570 Register product_high, Register product, 4571 Register tmp, int offset) { 4572 4573 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 4574 // z[kdx] = (jlong)product; 4575 4576 sldi(tmp, idx, LogBytesPerInt); 4577 if (offset) { 4578 addi(tmp, tmp, offset); 4579 } 4580 ldx(yz_idx, y, tmp); 4581 #ifdef VM_LITTLE_ENDIAN 4582 rldicl(yz_idx, yz_idx, 32, 0); 4583 #endif 4584 4585 multiply64(product_high, product, x_xstart, yz_idx); 4586 ldx(yz_idx, z, tmp); 4587 #ifdef VM_LITTLE_ENDIAN 4588 rldicl(yz_idx, yz_idx, 32, 0); 4589 #endif 4590 4591 add2_with_carry(product_high, product, carry, yz_idx); 4592 4593 sldi(tmp, idx, LogBytesPerInt); 4594 if (offset) { 4595 addi(tmp, tmp, offset); 4596 } 4597 #ifdef VM_LITTLE_ENDIAN 4598 rldicl(product, product, 32, 0); 4599 #endif 4600 stdx(product, z, tmp); 4601 } 4602 4603 // Multiply 128 bit by 128 bit. Unrolled inner loop. 4604 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, 4605 Register y, Register z, 4606 Register yz_idx, Register idx, Register carry, 4607 Register product_high, Register product, 4608 Register carry2, Register tmp) { 4609 4610 // jlong carry, x[], y[], z[]; 4611 // int kdx = ystart+1; 4612 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 4613 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 4614 // z[kdx+idx+1] = (jlong)product; 4615 // jlong carry2 = (jlong)(product >>> 64); 4616 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 4617 // z[kdx+idx] = (jlong)product; 4618 // carry = (jlong)(product >>> 64); 4619 // } 4620 // idx += 2; 4621 // if (idx > 0) { 4622 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 4623 // z[kdx+idx] = (jlong)product; 4624 // carry = (jlong)(product >>> 64); 4625 // } 4626 4627 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 4628 const Register jdx = R0; 4629 4630 // Scale the index. 4631 srdi_(jdx, idx, 2); 4632 beq(CCR0, L_third_loop_exit); 4633 mtctr(jdx); 4634 4635 align(32, 16); 4636 bind(L_third_loop); 4637 4638 addi(idx, idx, -4); 4639 4640 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8); 4641 mr_if_needed(carry2, product_high); 4642 4643 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0); 4644 mr_if_needed(carry, product_high); 4645 bdnz(L_third_loop); 4646 4647 bind(L_third_loop_exit); // Handle any left-over operand parts. 
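// Sketch of the left-over handling below: at most three 32-bit digits of y remain
// (idx & 0x3). A 2-digit chunk is folded first via multiply_add_128_x_128, then a
// final single digit, if any, is processed with a 32-bit multiply-add.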

  andi_(idx, idx, 0x3);
  beq(CCR0, L_post_third_loop_done);

  Label L_check_1;

  addic_(idx, idx, -2);
  blt(CCR0, L_check_1);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
  mr_if_needed(carry, product_high);

  bind(L_check_1);

  addi(idx, idx, 0x2);
  andi_(idx, idx, 0x1);
  addic_(idx, idx, -1);
  blt(CCR0, L_post_third_loop_done);

  sldi(tmp, idx, LogBytesPerInt);
  lwzx(yz_idx, y, tmp);
  multiply64(product_high, product, x_xstart, yz_idx);
  lwzx(yz_idx, z, tmp);

  add2_with_carry(product_high, product, yz_idx, carry);

  sldi(tmp, idx, LogBytesPerInt);
  stwx(product, z, tmp);
  srdi(product, product, 32);

  sldi(product_high, product_high, 32);
  orr(product, product, product_high);
  mr_if_needed(carry, product);

  bind(L_post_third_loop_done);
}   // multiply_128_x_128_loop

void MacroAssembler::muladd(Register out, Register in,
                            Register offset, Register len, Register k,
                            Register tmp1, Register tmp2, Register carry) {

  // Labels
  Label LOOP, SKIP;

  // Make sure length is positive.
  cmpdi  (CCR0,    len,     0);

  // Prepare variables
  subi   (offset,  offset,  4);
  li     (carry,   0);
  ble    (CCR0,    SKIP);

  mtctr  (len);
  subi   (len,     len,     1);
  sldi   (len,     len,     2);

  // Main loop
  bind(LOOP);
  lwzx   (tmp1,    len,     in);
  lwzx   (tmp2,    offset,  out);
  mulld  (tmp1,    tmp1,    k);
  add    (tmp2,    carry,   tmp2);
  add    (tmp2,    tmp1,    tmp2);
  stwx   (tmp2,    offset,  out);
  srdi   (carry,   tmp2,    32);
  subi   (offset,  offset,  4);
  subi   (len,     len,     4);
  bdnz   (LOOP);
  bind(SKIP);
}

void MacroAssembler::multiply_to_len(Register x, Register xlen,
                                     Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2,
                                     Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6,
                                     Register tmp7, Register tmp8,
                                     Register tmp9, Register tmp10,
                                     Register tmp11, Register tmp12,
                                     Register tmp13) {

  ShortBranchVerifier sbv(this);

  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
  assert_different_registers(x, xlen, y, ylen, z, zlen,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = tmp6;
  const Register product_high = tmp7;
  const Register x_xstart = tmp8;
  const Register tmp = tmp9;

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;

  mr_if_needed(idx, ylen);        // idx = ylen
  mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
  li(carry, 0);                   // carry = 0

  Label L_done;

  addic_(xstart, xlen, -1);
  blt(CCR0, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
                        carry, product_high, product, idx, kdx, tmp);

  Label L_second_loop;

  cmpdi(CCR0, kdx, 0);
  beq(CCR0, L_second_loop);

  Label L_carry;

  addic_(kdx, kdx, -1);
  beq(CCR0, L_carry);

  // Store lower 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);
  srdi(carry, carry, 32);
  addi(kdx, kdx, -1);


  bind(L_carry);

  // Store upper 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);

  // Second and third (nested) loops.
  //
  //  for (int i = xstart-1; i >= 0; i--) { // Second loop
  //    carry = 0;
  //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                     (z[k] & LONG_MASK) + carry;
  //      z[k] = (int)product;
  //      carry = product >>> 32;
  //    }
  //    z[i] = (int)carry;
  //  }
  //
  //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = x_xstart

  bind(L_second_loop);

  li(carry, 0);                   // carry = 0;

  addic_(xstart, xstart, -1);     // i = xstart-1;
  blt(CCR0, L_done);

  Register zsave = tmp10;

  mr(zsave, z);


  Label L_last_x;

  sldi(tmp, xstart, LogBytesPerInt);
  add(z, z, tmp);                 // z = z + k - j
  addi(z, z, 4);
  addic_(xstart, xstart, -1);     // i = xstart-1;
  blt(CCR0, L_last_x);

  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);
#endif


  Label L_third_loop_prologue;

  bind(L_third_loop_prologue);

  Register xsave = tmp11;
  Register xlensave = tmp12;
  Register ylensave = tmp13;

  mr(xsave, x);
  mr(xlensave, xstart);
  mr(ylensave, ylen);


  multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
                          carry, product_high, product, x, tmp);

  mr(z, zsave);
  mr(x, xsave);
  mr(xlen, xlensave);   // This is the decrement of the loop counter!
  mr(ylen, ylensave);

  addi(tmp3, xlen, 1);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  addic_(tmp3, tmp3, -1);
  blt(CCR0, L_done);

  srdi(carry, carry, 32);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
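  // (A hedged note from reading the code below: L_last_x covers the case where
  // only one 32-bit limb of x remains; it is loaded zero-extended into x_xstart
  // so the 64x64 multiply path above still produces the correct product.)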
  bind(L_last_x);

  lwz(x_xstart, 0, x);
  b(L_third_loop_prologue);

  bind(L_done);
}   // multiply_to_len

void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
#ifdef ASSERT
  Label ok;
  if (check_equal) {
    beq(CCR0, ok);
  } else {
    bne(CCR0, ok);
  }
  stop(msg, id);
  bind(ok);
#endif
}

void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
                                          Register mem_base, const char* msg, int id) {
#ifdef ASSERT
  switch (size) {
    case 4:
      lwz(R0, mem_offset, mem_base);
      cmpwi(CCR0, R0, 0);
      break;
    case 8:
      ld(R0, mem_offset, mem_base);
      cmpdi(CCR0, R0, 0);
      break;
    default:
      ShouldNotReachHere();
  }
  asm_assert(check_equal, msg, id);
#endif // ASSERT
}

void MacroAssembler::verify_thread() {
  if (VerifyThread) {
    unimplemented("'VerifyThread' currently not implemented on PPC");
  }
}

// READ: oop. KILL: R0 (and possibly the volatile floating-point registers).
void MacroAssembler::verify_oop(Register oop, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  mr_if_needed(R4_ARG2, oop);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}

void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
  if (!VerifyOops) {
    return;
  }

  address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
  const Register tmp = R11; // Will be preserved.
  const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
  save_volatile_gprs(R1_SP, -nbytes_save); // except R0

  ld(R4_ARG2, offs, base);
  save_LR_CR(tmp); // save in old frame
  push_frame_reg_args(nbytes_save, tmp);
  // load FunctionDescriptor** / entry_address *
  load_const_optimized(tmp, fd, R0);
  // load FunctionDescriptor* / entry_address
  ld(tmp, 0, tmp);
  load_const_optimized(R3_ARG1, (address)msg, R0);
  // Call destination for its side effect.
  call_c(tmp);

  pop_frame();
  restore_LR_CR(tmp);
  restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
}

const char* stop_types[] = {
  "stop",
  "untested",
  "unimplemented",
  "shouldnotreachhere"
};

static void stop_on_request(int tp, const char* msg) {
  tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
  guarantee(false, "PPC assembly code requires stop: %s", msg);
}

// Call a C-function that prints output.
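// A hedged summary of the stop() sequence that follows (my reading of the code,
// not an external spec): the stop type and message are materialized in the
// argument registers, stop_on_request() is called via call_VM_leaf for its
// printing and guarantee() side effect, and an illtrap followed by the 32-bit
// id is emitted so the trapping PC identifies which stop site fired.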
void MacroAssembler::stop(int type, const char* msg, int id) {
#ifndef PRODUCT
  block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
#else
  block_comment("stop {");
#endif

  // setup arguments
  load_const_optimized(R3_ARG1, type);
  load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
  illtrap();
  emit_int32(id);
  block_comment("} stop;");
}

#ifndef PRODUCT
// Write pattern 0x0101010101010101 in memory region [low-before, high+after].
// Val, addr are temp registers.
// If low == addr, addr is killed.
// High is preserved.
void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
  if (!ZapMemory) return;

  assert_different_registers(low, val);

  BLOCK_COMMENT("zap memory region {");
  load_const_optimized(val, 0x0101010101010101);
  int size = before + after;
  if (low == high && size < 5 && size > 0) {
    int offset = -before*BytesPerWord;
    for (int i = 0; i < size; ++i) {
      std(val, offset, low);
      offset += (1*BytesPerWord);
    }
  } else {
    addi(addr, low, -before*BytesPerWord);
    assert_different_registers(high, val);
    if (after) addi(high, high, after * BytesPerWord);
    Label loop;
    bind(loop);
    std(val, 0, addr);
    addi(addr, addr, 8);
    cmpd(CCR6, addr, high);
    ble(CCR6, loop);
    if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value.
  }
  BLOCK_COMMENT("} zap memory region");
}

#endif // !PRODUCT

void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
                                                  const bool* flag_addr, Label& label) {
  int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
  assert(sizeof(bool) == 1, "PowerPC ABI");
  masm->lbz(temp, simm16_offset, temp);
  masm->cmpwi(CCR0, temp, 0);
  masm->beq(CCR0, label);
}

SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
  skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
}

SkipIfEqualZero::~SkipIfEqualZero() {
  _masm->bind(_label);
}
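
// A hedged usage sketch (illustrative only, not taken from this file): the
// SkipIfEqualZero RAII helper brackets emitted code that should be skipped at
// run time whenever the watched bool flag is zero. The flag and scratch
// register used in the example are assumptions.
//
//   {
//     SkipIfEqualZero skip_if_zero(masm, R11_scratch1, &DTraceAllocProbes);
//     // ... code emitted here runs only when the flag is true ...
//   } // the destructor binds the skip-target label here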