/*
 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"
#ifdef COMPILER1
#include "c1/c1_LIRAssembler.hpp"
#endif
#ifdef COMPILER2
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
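//
// For example (illustrative addresses): patching an unconditional branch
// at 0x10000 to target 0x10100 stores (0x10100 - 0x10000) >> 2 == 0x40
// into the imm26 field, since branch offsets are encoded in units of
// 4-byte instruction words.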
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target - branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      uint64_t dest = (uint64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
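      // For example (illustrative addresses): for an adrp at pc
      // 0x55501234 addressing a target at 0x07f80abc, the patched page
      // delta is (0x07f80abc >> 12) - (0x55501234 >> 12), and for types
      // 1 and 2 the low 12 bits of the target (0xabc) go into the
      // immediate field of the following ldr/str or add.
      //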
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                    21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    uint64_t dest = (uint64_t)target;
    // Move wide constant
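    // A 48-bit address is split into three 16-bit pieces, e.g.
    // (illustrative value) 0x7fde89abcdef is patched back as
    //   movz Rx, #0xcdef
    //   movk Rx, #0x89ab, lsl #16
    //   movk Rx, #0x7fde, lsl #32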
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
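  // For example (illustrative value): a narrow OOP 0x1234abcd is encoded
  // as movz Rx, #0x1234, lsl #16 followed by movk Rx, #0xabcd, so the
  // movz carries the upper 16 bits and the movk the lower 16.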
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    uint32_t *insns = (uint32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(uint64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (uint64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (uint64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
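// Concretely (a sketch of the hazard): without the acquire, the CPU may
// satisfy the later load of SafepointSynchronize::_state before this
// load of the poll word, so we could see the poll page disarmed while
// the _state value we act on is stale, and the two checks would
// disagree about whether a safepoint is in progress.
//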
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp & sp of the last Java frame have to be
// recorded in the (thread-local) JavaThread object. When leaving C land,
// the last Java fp has to be reset to 0. This is required to allow proper
// stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  assert(last_java_pc != NULL, "must provide a valid PC");

  adr(scratch, last_java_pc);
  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4G, which is within ADRP's reach.
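    // E.g. (illustrative addresses): with this adrp at pc 0x55001000 and
    // entry.target() == 0x55602344, adrp materializes the target's page
    // base 0x55602000 in tmp and returns the low 12 bits 0x344 in
    // offset, which the add below folds back in to form the full address.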
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4G, which is within ADRP's reach.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result, "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)
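//
// Concretely, the stub emitted below is:
//   ldr  rscratch1, <dest>    // load the 64-bit destination word
//   br   rscratch1
//   <dest>: .quad dest        // at NativeCallTrampolineStub::data_offset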

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                   + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();
  // Now, create the trampoline stub's code:
  // - load the call target into rscratch1
  // - branch to it
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

void MacroAssembler::emit_static_call_stub() {
  // CompiledDirectStaticCall::set_to_interpreted knows the
  // exact layout of this stub.

  isb();
  mov_metadata(rmethod, (Metadata*)NULL);

  // Jump to the entry point of the i2c stub.
  movptr(rscratch1, 0);
  br(rscratch1);
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
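  // For example, x == 0x100 has a zero low byte and must yield 0, while
  // x == 0x01 yields 1: tst(x, 0xff) tests only bits 0-7 and cset then
  // materializes the NE condition as 0 or 1.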
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


void MacroAssembler::notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler::notify(type);
    // reset_last_Java_frame(true);
  } else {
    Assembler::notify(type);
  }
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
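  // The scan loop below is emitted twice (peel == 1, then peel == 0):
  // the peeled first iteration branches straight to found_method on a
  // hit, so a first-entry match falls through on a predicted branch and
  // only misses enter the search loop proper.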
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).
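  //
  // A sketch of that aliasing (assuming the usual HotSpot Klass layout):
  // for a primary supertype, super_check_offset points at the fixed slot
  // in the _primary_supers display that must hold super_klass; for a
  // secondary supertype it equals sc_offset, so the same ldr/cmp above
  // probes the one-element secondary_super_cache instead.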

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful
// scans count pointer-sized words at [addr] for an occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4-byte words at [addr] for an occurrence of value,
// generic
1211 void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1212                                 Register scratch) {
1213   Label Lloop, Lexit;
1214   cbz(count, Lexit);
1215   bind(Lloop);
1216   ldrw(scratch, post(addr, wordSize));
1217   cmpw(value, scratch);
1218   br(EQ, Lexit);
1219   sub(count, count, 1);
1220   cbnz(count, Lloop);
1221   bind(Lexit);
1222 }
1223 
1224 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1225                                                    Register super_klass,
1226                                                    Register temp_reg,
1227                                                    Register temp2_reg,
1228                                                    Label* L_success,
1229                                                    Label* L_failure,
1230                                                    bool set_cond_codes) {
1231   assert_different_registers(sub_klass, super_klass, temp_reg);
1232   if (temp2_reg != noreg)
1233     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1234 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1235 
1236   Label L_fallthrough;
1237   int label_nulls = 0;
1238   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1239   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1240   assert(label_nulls <= 1, "at most one NULL in the batch");
1241 
1242   // a couple of useful fields in sub_klass:
1243   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1244   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1245   Address secondary_supers_addr(sub_klass, ss_offset);
1246   Address super_cache_addr(     sub_klass, sc_offset);
1247 
1248   BLOCK_COMMENT("check_klass_subtype_slow_path");
1249 
1250   // Do a linear scan of the secondary super-klass chain.
1251   // This code is rarely used, so simplicity is a virtue here.
1252   // The repne_scan instruction uses fixed registers, which we must spill.
1253   // Don't worry too much about pre-existing connections with the input regs.
1254 
1255   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1256   assert(sub_klass != r2, "killed reg"); // killed below: r2 holds the scan count
1257 
1258   RegSet pushed_registers;
1259   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1260   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1261 
1262   if (super_klass != r0 || UseCompressedOops) {
1263     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1264   }
1265 
1266   push(pushed_registers, sp);
1267 
1268   // Get super_klass value into r0 (even if it was in r5 or r2).
1269   if (super_klass != r0) {
1270     mov(r0, super_klass);
1271   }
1272 
1273 #ifndef PRODUCT
1274   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1275   Address pst_counter_addr(rscratch2);
1276   ldr(rscratch1, pst_counter_addr);
1277   add(rscratch1, rscratch1, 1);
1278   str(rscratch1, pst_counter_addr);
1279 #endif //PRODUCT
1280 
1281   // We will consult the secondary-super array.
1282   ldr(r5, secondary_supers_addr);
1283   // Load the array length.
1284   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1285   // Skip to start of data.
1286   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1287 
1288   cmp(sp, zr); // Clear Z flag; SP is never zero
1289   // Scan R2 words at [R5] for an occurrence of R0.
1290   // Set NZ/Z based on last compare.
1291   repne_scan(r5, r0, r2, rscratch1);
1292 
1293   // Unspill the temp. registers:
1294   pop(pushed_registers, sp);
1295 
1296   br(Assembler::NE, *L_failure);
1297 
1298   // Success.  Cache the super we found and proceed in triumph.
1299   str(super_klass, super_cache_addr);
1300 
1301   if (L_success != &L_fallthrough) {
1302     b(*L_success);
1303   }
1304 
1305 #undef IS_A_TEMP
1306 
1307   bind(L_fallthrough);
1308 }
1309 
1310 
1311 void MacroAssembler::verify_oop(Register reg, const char* s) {
1312   if (!VerifyOops || VerifyAdapterSharing) {
1313     // The address of the code string (below) confuses VerifyAdapterSharing
1314     // because it may differ between otherwise equivalent adapters.
1315     return;
1316   }
1317 
1318   // Pass register number to verify_oop_subroutine
1319   const char* b = NULL;
1320   {
1321     ResourceMark rm;
1322     stringStream ss;
1323     ss.print("verify_oop: %s: %s", reg->name(), s);
1324     b = code_string(ss.as_string());
1325   }
1326   BLOCK_COMMENT("verify_oop {");
1327 
1328   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1329   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1330 
1331   mov(r0, reg);
1332   mov(rscratch1, (address)b);
1333 
1334   // call indirectly to solve generation ordering problem
1335   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1336   ldr(rscratch2, Address(rscratch2));
1337   blr(rscratch2);
1338 
1339   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1340   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1341 
1342   BLOCK_COMMENT("} verify_oop");
1343 }
1344 
1345 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1346   if (!VerifyOops || VerifyAdapterSharing) {
1347     // The address of the code string (below) confuses VerifyAdapterSharing
1348     // because it may differ between otherwise equivalent adapters.
1349     return;
1350   }
1351 
1352   const char* b = NULL;
1353   {
1354     ResourceMark rm;
1355     stringStream ss;
1356     ss.print("verify_oop_addr: %s", s);
1357     b = code_string(ss.as_string());
1358   }
1359   BLOCK_COMMENT("verify_oop_addr {");
1360 
1361   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1362   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1363 
1364   // addr may contain sp so we will have to adjust it based on the
1365   // pushes that we just did.
1366   if (addr.uses(sp)) {
1367     lea(r0, addr);
1368     ldr(r0, Address(r0, 4 * wordSize));
1369   } else {
1370     ldr(r0, addr);
1371   }
1372   mov(rscratch1, (address)b);
1373 
1374   // call indirectly to solve generation ordering problem
1375   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1376   ldr(rscratch2, Address(rscratch2));
1377   blr(rscratch2);
1378 
1379   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1380   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1381 
1382   BLOCK_COMMENT("} verify_oop_addr");
1383 }
1384 
1385 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1386                                          int extra_slot_offset) {
1387   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1388   int stackElementSize = Interpreter::stackElementSize;
1389   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1390 #ifdef ASSERT
1391   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1392   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1393 #endif
1394   if (arg_slot.is_constant()) {
1395     return Address(esp, arg_slot.as_constant() * stackElementSize
1396                    + offset);
1397   } else {
1398     add(rscratch1, esp, arg_slot.as_register(),
1399         ext::uxtx, exact_log2(stackElementSize));
1400     return Address(rscratch1, offset);
1401   }
1402 }
1403 
1404 void MacroAssembler::call_VM_leaf_base(address entry_point,
1405                                        int number_of_arguments,
1406                                        Label *retaddr) {
1407   call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1408 }
1409 
1410 void MacroAssembler::call_VM_leaf_base1(address entry_point,
1411                                         int number_of_gp_arguments,
1412                                         int number_of_fp_arguments,
1413                                         ret_type type,
1414                                         Label *retaddr) {
1415   Label E, L;
1416 
1417   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1418 
1419   // We add 1 to number_of_gp_arguments because the thread in arg0 is
1420   // not counted
1421   mov(rscratch1, entry_point);
1422   blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1423   if (retaddr)
1424     bind(*retaddr);
1425 
1426   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1427   maybe_isb();
1428 }
1429 
1430 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1431   call_VM_leaf_base(entry_point, number_of_arguments);
1432 }
1433 
1434 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1435   pass_arg0(this, arg_0);
1436   call_VM_leaf_base(entry_point, 1);
1437 }
1438 
1439 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1440   pass_arg0(this, arg_0);
1441   pass_arg1(this, arg_1);
1442   call_VM_leaf_base(entry_point, 2);
1443 }
1444 
1445 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1446                                   Register arg_1, Register arg_2) {
1447   pass_arg0(this, arg_0);
1448   pass_arg1(this, arg_1);
1449   pass_arg2(this, arg_2);
1450   call_VM_leaf_base(entry_point, 3);
1451 }
1452 
1453 void MacroAssembler::super_call_VM_leaf(address entry_point) {
1454   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1455 }
1456 
1457 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1458   pass_arg0(this, arg_0);
1459   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1460 }
1461 
1462 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1463 
1464   assert(arg_0 != c_rarg1, "smashed arg");
1465   pass_arg1(this, arg_1);
1466   pass_arg0(this, arg_0);
1467   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1468 }
1469 
1470 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1471   assert(arg_0 != c_rarg2, "smashed arg");
1472   assert(arg_1 != c_rarg2, "smashed arg");
1473   pass_arg2(this, arg_2);
1474   assert(arg_0 != c_rarg1, "smashed arg");
1475   pass_arg1(this, arg_1);
1476   pass_arg0(this, arg_0);
1477   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1478 }
1479 
1480 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1481   assert(arg_0 != c_rarg3, "smashed arg");
1482   assert(arg_1 != c_rarg3, "smashed arg");
1483   assert(arg_2 != c_rarg3, "smashed arg");
1484   pass_arg3(this, arg_3);
1485   assert(arg_0 != c_rarg2, "smashed arg");
1486   assert(arg_1 != c_rarg2, "smashed arg");
1487   pass_arg2(this, arg_2);
1488   assert(arg_0 != c_rarg1, "smashed arg");
1489   pass_arg1(this, arg_1);
1490   pass_arg0(this, arg_0);
1491   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1492 }
1493 
1494 void MacroAssembler::null_check(Register reg, int offset) {
1495   if (needs_explicit_null_check(offset)) {
1496     // provoke OS NULL exception if reg = NULL by
1497     // accessing M[reg] w/o changing any registers
1498     // NOTE: this is plenty to provoke a segv
1499     ldr(zr, Address(reg));
1500   } else {
1501     // nothing to do, (later) access of M[reg + offset]
1502     // will provoke OS NULL exception if reg = NULL
1503   }
1504 }
1505 
1506 void MacroAssembler::test_klass_is_value(Register klass, Register temp_reg, Label& is_value) {
1507   ldrw(temp_reg, Address(klass, Klass::access_flags_offset()));
1508   andr(temp_reg, temp_reg, JVM_ACC_VALUE);
1509   cbnz(temp_reg, is_value); 
1510 }
1511 
1512 void MacroAssembler::test_field_is_flattenable(Register flags, Register temp_reg, Label& is_flattenable) {
1513   (void) temp_reg; // keep signature uniform with x86
1514   tbnz(flags, ConstantPoolCacheEntry::is_flattenable_field_shift, is_flattenable);
1515 }
1516 
1517 void MacroAssembler::test_field_is_not_flattenable(Register flags, Register temp_reg, Label& not_flattenable) {
1518   (void) temp_reg; // keep signature uniform with x86
1519   tbz(flags, ConstantPoolCacheEntry::is_flattenable_field_shift, not_flattenable);
1520 }
1521 
1522 void MacroAssembler::test_field_is_flattened(Register flags, Register temp_reg, Label& is_flattened) {
1523   (void) temp_reg; // keep signature uniform with x86
1524   tbnz(flags, ConstantPoolCacheEntry::is_flattened_field_shift, is_flattened);
1525 }
1526 
1527 void MacroAssembler::test_flattened_array_oop(Register oop, Register temp_reg, Label& is_flattened_array) {
1528   load_storage_props(temp_reg, oop);
1529   andr(temp_reg, temp_reg, ArrayStorageProperties::flattened_value);
1530   cbnz(temp_reg, is_flattened_array);
1531 }
1532 
1533 void MacroAssembler::test_null_free_array_oop(Register oop, Register temp_reg, Label& is_null_free_array) {
1534   load_storage_props(temp_reg, oop);
1535   andr(temp_reg, temp_reg, ArrayStorageProperties::null_free_value);
1536   cbnz(temp_reg, is_null_free_array);
1537 }
1538 
1539 // MacroAssembler protected routines needed to implement
1540 // public methods
1541 
1542 void MacroAssembler::mov(Register r, Address dest) {
1543   code_section()->relocate(pc(), dest.rspec());
1544   u_int64_t imm64 = (u_int64_t)dest.target();
1545   movptr(r, imm64);
1546 }
1547 
1548 // Move a constant pointer into r.  In AArch64 mode the virtual
1549 // address space is 48 bits in size, so we only need three
1550 // instructions to create a patchable instruction sequence that can
1551 // reach anywhere.
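     // e.g. (illustrative) movptr(r0, 0x123456789abc) emits:
     //   movz r0, #0x9abc
     //   movk r0, #0x5678, lsl #16
     //   movk r0, #0x1234, lsl #32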
1552 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1553 #ifndef PRODUCT
1554   {
1555     char buffer[64];
1556     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1557     block_comment(buffer);
1558   }
1559 #endif
1560   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1561   movz(r, imm64 & 0xffff);
1562   imm64 >>= 16;
1563   movk(r, imm64 & 0xffff, 16);
1564   imm64 >>= 16;
1565   movk(r, imm64 & 0xffff, 32);
1566 }
1567 
1568 // Macro to mov replicated immediate to vector register.
1569 //  Vd will get the following values for different arrangements in T
1570 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1571 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1572 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1573 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1574 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1575 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1576 //   T1D/T2D: invalid
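     //  e.g. (illustrative) mov(v0, T4S, 0x00ff0000) emits
     //    movi v0.4s, #0xff, lsl #16
     //  while mov(v0, T4S, 0xffff00ff) takes the inverted path and emits
     //    mvni v0.4s, #0xff, lsl #8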
1577 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1578   assert(T != T1D && T != T2D, "invalid arrangement");
1579   if (T == T8B || T == T16B) {
1580     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1581     movi(Vd, T, imm32 & 0xff, 0);
1582     return;
1583   }
1584   u_int32_t nimm32 = ~imm32;
1585   if (T == T4H || T == T8H) {
1586     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1587     imm32 &= 0xffff;
1588     nimm32 &= 0xffff;
1589   }
1590   u_int32_t x = imm32;
1591   int movi_cnt = 0;
1592   int movn_cnt = 0;
1593   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1594   x = nimm32;
1595   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1596   if (movn_cnt < movi_cnt) imm32 = nimm32;
1597   unsigned lsl = 0;
1598   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1599   if (movn_cnt < movi_cnt)
1600     mvni(Vd, T, imm32 & 0xff, lsl);
1601   else
1602     movi(Vd, T, imm32 & 0xff, lsl);
1603   imm32 >>= 8; lsl += 8;
1604   while (imm32) {
1605     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1606     if (movn_cnt < movi_cnt)
1607       bici(Vd, T, imm32 & 0xff, lsl);
1608     else
1609       orri(Vd, T, imm32 & 0xff, lsl);
1610     lsl += 8; imm32 >>= 8;
1611   }
1612 }
1613 
1614 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1615 {
1616 #ifndef PRODUCT
1617   {
1618     char buffer[64];
1619     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1620     block_comment(buffer);
1621   }
1622 #endif
1623   if (operand_valid_for_logical_immediate(false, imm64)) {
1624     orr(dst, zr, imm64);
1625   } else {
1626     // we can use a combination of MOVZ or MOVN with
1627     // MOVK to build up the constant
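         // e.g. (illustrative) imm64 == 0xffffffff12345678 has two 0xffff
         // halfwords, so the neg_count == 2 case below emits:
         //   movn dst, #0xa987            // dst = 0xffffffffffff5678
         //   movk dst, #0x1234, lsl #16   // dst = 0xffffffff12345678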
1628     u_int64_t imm_h[4];
1629     int zero_count = 0;
1630     int neg_count = 0;
1631     int i;
1632     for (i = 0; i < 4; i++) {
1633       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1634       if (imm_h[i] == 0) {
1635         zero_count++;
1636       } else if (imm_h[i] == 0xffffL) {
1637         neg_count++;
1638       }
1639     }
1640     if (zero_count == 4) {
1641       // one MOVZ will do
1642       movz(dst, 0);
1643     } else if (neg_count == 4) {
1644       // one MOVN will do
1645       movn(dst, 0);
1646     } else if (zero_count == 3) {
1647       for (i = 0; i < 4; i++) {
1648         if (imm_h[i] != 0L) {
1649           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1650           break;
1651         }
1652       }
1653     } else if (neg_count == 3) {
1654       // one MOVN will do
1655       for (int i = 0; i < 4; i++) {
1656         if (imm_h[i] != 0xffffL) {
1657           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1658           break;
1659         }
1660       }
1661     } else if (zero_count == 2) {
1662       // one MOVZ and one MOVK will do
1663       for (i = 0; i < 3; i++) {
1664         if (imm_h[i] != 0L) {
1665           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1666           i++;
1667           break;
1668         }
1669       }
1670       for (;i < 4; i++) {
1671         if (imm_h[i] != 0L) {
1672           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1673         }
1674       }
1675     } else if (neg_count == 2) {
1676       // one MOVN and one MOVK will do
1677       for (i = 0; i < 4; i++) {
1678         if (imm_h[i] != 0xffffL) {
1679           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1680           i++;
1681           break;
1682         }
1683       }
1684       for (;i < 4; i++) {
1685         if (imm_h[i] != 0xffffL) {
1686           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1687         }
1688       }
1689     } else if (zero_count == 1) {
1690       // one MOVZ and two MOVKs will do
1691       for (i = 0; i < 4; i++) {
1692         if (imm_h[i] != 0L) {
1693           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1694           i++;
1695           break;
1696         }
1697       }
1698       for (;i < 4; i++) {
1699         if (imm_h[i] != 0x0L) {
1700           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1701         }
1702       }
1703     } else if (neg_count == 1) {
1704       // one MOVN and two MOVKs will do
1705       for (i = 0; i < 4; i++) {
1706         if (imm_h[i] != 0xffffL) {
1707           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1708           i++;
1709           break;
1710         }
1711       }
1712       for (;i < 4; i++) {
1713         if (imm_h[i] != 0xffffL) {
1714           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1715         }
1716       }
1717     } else {
1718       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1719       movz(dst, (u_int32_t)imm_h[0], 0);
1720       for (i = 1; i < 4; i++) {
1721         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1722       }
1723     }
1724   }
1725 }
1726 
1727 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1728 {
1729 #ifndef PRODUCT
1730     {
1731       char buffer[64];
1732       snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1733       block_comment(buffer);
1734     }
1735 #endif
1736   if (operand_valid_for_logical_immediate(true, imm32)) {
1737     orrw(dst, zr, imm32);
1738   } else {
1739     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1740     // constant
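         // e.g. (illustrative) imm32 == 0xffff1234 hits the imm_h[1] == 0xffff
         // case below, so a single movnw(dst, 0xedcb, 0) suffices:
         // dst = ~0x0000edcb == 0xffff1234.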
1741     u_int32_t imm_h[2];
1742     imm_h[0] = imm32 & 0xffff;
1743     imm_h[1] = ((imm32 >> 16) & 0xffff);
1744     if (imm_h[0] == 0) {
1745       movzw(dst, imm_h[1], 16);
1746     } else if (imm_h[0] == 0xffff) {
1747       movnw(dst, imm_h[1] ^ 0xffff, 16);
1748     } else if (imm_h[1] == 0) {
1749       movzw(dst, imm_h[0], 0);
1750     } else if (imm_h[1] == 0xffff) {
1751       movnw(dst, imm_h[0] ^ 0xffff, 0);
1752     } else {
1753       // use a MOVZ and MOVK (makes it easier to debug)
1754       movzw(dst, imm_h[0], 0);
1755       movkw(dst, imm_h[1], 16);
1756     }
1757   }
1758 }
1759 
1760 // Form an address from base + offset in Rd.  Rd may or may
1761 // not actually be used: you must use the Address that is returned.
1762 // It is up to you to ensure that the shift provided matches the size
1763 // of your data.
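     // e.g. (illustrative) with shift == 3, byte_offset == 0x41008 is aligned
     // but too big for a single scaled 12-bit immediate, so this emits
     //   add Rd, base, #0x40000
     // and returns Address(Rd, 0x1008).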
1764 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1765   if (Address::offset_ok_for_immed(byte_offset, shift))
1766     // It fits; no need for any heroics
1767     return Address(base, byte_offset);
1768 
1769   // Don't do anything clever with negative or misaligned offsets
1770   unsigned mask = (1 << shift) - 1;
1771   if (byte_offset < 0 || byte_offset & mask) {
1772     mov(Rd, byte_offset);
1773     add(Rd, base, Rd);
1774     return Address(Rd);
1775   }
1776 
1777   // See if we can do this with two 12-bit offsets
1778   {
1779     unsigned long word_offset = byte_offset >> shift;
1780     unsigned long masked_offset = word_offset & 0xfff000;
1781     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1782         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1783       add(Rd, base, masked_offset << shift);
1784       word_offset -= masked_offset;
1785       return Address(Rd, word_offset << shift);
1786     }
1787   }
1788 
1789   // Do it the hard way
1790   mov(Rd, byte_offset);
1791   add(Rd, base, Rd);
1792   return Address(Rd);
1793 }
1794 
1795 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1796   if (UseLSE) {
1797     mov(tmp, 1);
1798     ldadd(Assembler::word, tmp, zr, counter_addr);
1799     return;
1800   }
1801   Label retry_load;
1802   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1803     prfm(Address(counter_addr), PSTL1STRM);
1804   bind(retry_load);
1805   // flush and load exclusive from the memory location
1806   ldxrw(tmp, counter_addr);
1807   addw(tmp, tmp, 1);
1808   // if we store+flush with no intervening write tmp will be zero
1809   stxrw(tmp2, tmp, counter_addr);
1810   cbnzw(tmp2, retry_load);
1811 }
1812 
1813 
1814 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1815                                     bool want_remainder, Register scratch)
1816 {
1817   // Full implementation of Java idiv and irem.  The function
1818   // returns the (pc) offset of the div instruction - may be needed
1819   // for implicit exceptions.
1820   //
1821   // constraint : ra/rb =/= scratch
1822   //         normal case
1823   //
1824   // input : ra: dividend
1825   //         rb: divisor
1826   //
1827   // result: either
1828   //         quotient  (= ra idiv rb)
1829   //         remainder (= ra irem rb)
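       //
       // Java division rounds toward zero, as sdiv does, so no correction
       // is needed; e.g. (illustrative) -7 idiv 2 == -3 and
       // -7 irem 2 == -1 == -7 - (-3 * 2), which is what msubw computes.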
1830 
1831   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1832 
1833   int idivl_offset = offset();
1834   if (! want_remainder) {
1835     sdivw(result, ra, rb);
1836   } else {
1837     sdivw(scratch, ra, rb);
1838     Assembler::msubw(result, scratch, rb, ra);
1839   }
1840 
1841   return idivl_offset;
1842 }
1843 
1844 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1845                                     bool want_remainder, Register scratch)
1846 {
1847   // Full implementation of Java ldiv and lrem.  The function
1848   // returns the (pc) offset of the div instruction - may be needed
1849   // for implicit exceptions.
1850   //
1851   // constraint : ra/rb =/= scratch
1852   //         normal case
1853   //
1854   // input : ra: dividend
1855   //         rb: divisor
1856   //
1857   // result: either
1858   //         quotient  (= ra idiv rb)
1859   //         remainder (= ra irem rb)
1860 
1861   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1862 
1863   int idivq_offset = offset();
1864   if (! want_remainder) {
1865     sdiv(result, ra, rb);
1866   } else {
1867     sdiv(scratch, ra, rb);
1868     Assembler::msub(result, scratch, rb, ra);
1869   }
1870 
1871   return idivq_offset;
1872 }
1873 
1874 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1875   address prev = pc() - NativeMembar::instruction_size;
1876   address last = code()->last_insn();
1877   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1878     NativeMembar *bar = NativeMembar_at(prev);
1879     // We are merging two memory barrier instructions.  On AArch64 we
1880     // can do this simply by ORing them together.
1881     bar->set_kind(bar->get_kind() | order_constraint);
1882     BLOCK_COMMENT("merged membar");
1883   } else {
1884     code()->set_last_insn(pc());
1885     dmb(Assembler::barrier(order_constraint));
1886   }
1887 }
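     // e.g. (illustrative) with the merging above, the back-to-back calls
     //   membar(StoreStore); membar(StoreLoad);
     // emit a single dmb whose kind is the OR of the two constraints
     // rather than two separate barriers.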
1888 
1889 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1890   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1891     merge_ldst(rt, adr, size_in_bytes, is_store);
1892     code()->clear_last_insn();
1893     return true;
1894   } else {
1895     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8-byte or 4-byte loads/stores are supported.");
1896     const unsigned mask = size_in_bytes - 1;
1897     if (adr.getMode() == Address::base_plus_offset &&
1898         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1899       code()->set_last_insn(pc());
1900     }
1901     return false;
1902   }
1903 }
1904 
1905 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1906   // We always try to merge two adjacent loads into one ldp.
1907   if (!try_merge_ldst(Rx, adr, 8, false)) {
1908     Assembler::ldr(Rx, adr);
1909   }
1910 }
1911 
1912 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1913   // We always try to merge two adjacent loads into one ldp.
1914   if (!try_merge_ldst(Rw, adr, 4, false)) {
1915     Assembler::ldrw(Rw, adr);
1916   }
1917 }
1918 
1919 void MacroAssembler::str(Register Rx, const Address &adr) {
1920   // We always try to merge two adjacent stores into one stp.
1921   if (!try_merge_ldst(Rx, adr, 8, true)) {
1922     Assembler::str(Rx, adr);
1923   }
1924 }
1925 
1926 void MacroAssembler::strw(Register Rw, const Address &adr) {
1927   // We always try to merge two adjacent stores into one stp.
1928   if (!try_merge_ldst(Rw, adr, 4, true)) {
1929     Assembler::strw(Rw, adr);
1930   }
1931 }
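     // e.g. (illustrative) with the merging above, the adjacent stores
     //   str(r0, Address(sp, 16)); str(r1, Address(sp, 24));
     // collapse into the single instruction
     //   stp r0, r1, [sp, #16]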
1932 
1933 // MacroAssembler routines found actually to be needed
1934 
1935 void MacroAssembler::push(Register src)
1936 {
1937   str(src, Address(pre(esp, -1 * wordSize)));
1938 }
1939 
1940 void MacroAssembler::pop(Register dst)
1941 {
1942   ldr(dst, Address(post(esp, 1 * wordSize)));
1943 }
1944 
1945 // Note: load_unsigned_short used to be called load_unsigned_word.
1946 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1947   int off = offset();
1948   ldrh(dst, src);
1949   return off;
1950 }
1951 
1952 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1953   int off = offset();
1954   ldrb(dst, src);
1955   return off;
1956 }
1957 
1958 int MacroAssembler::load_signed_short(Register dst, Address src) {
1959   int off = offset();
1960   ldrsh(dst, src);
1961   return off;
1962 }
1963 
1964 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1965   int off = offset();
1966   ldrsb(dst, src);
1967   return off;
1968 }
1969 
1970 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1971   int off = offset();
1972   ldrshw(dst, src);
1973   return off;
1974 }
1975 
1976 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1977   int off = offset();
1978   ldrsbw(dst, src);
1979   return off;
1980 }
1981 
1982 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1983   switch (size_in_bytes) {
1984   case  8:  ldr(dst, src); break;
1985   case  4:  ldrw(dst, src); break;
1986   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1987   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1988   default:  ShouldNotReachHere();
1989   }
1990 }
1991 
1992 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1993   switch (size_in_bytes) {
1994   case  8:  str(src, dst); break;
1995   case  4:  strw(src, dst); break;
1996   case  2:  strh(src, dst); break;
1997   case  1:  strb(src, dst); break;
1998   default:  ShouldNotReachHere();
1999   }
2000 }
2001 
2002 void MacroAssembler::decrementw(Register reg, int value)
2003 {
2004   if (value < 0)  { incrementw(reg, -value);      return; }
2005   if (value == 0) {                               return; }
2006   if (value < (1 << 12)) { subw(reg, reg, value); return; }
2007   /* else */ {
2008     guarantee(reg != rscratch2, "invalid dst for register decrement");
2009     movw(rscratch2, (unsigned)value);
2010     subw(reg, reg, rscratch2);
2011   }
2012 }
2013 
2014 void MacroAssembler::decrement(Register reg, int value)
2015 {
2016   if (value < 0)  { increment(reg, -value);      return; }
2017   if (value == 0) {                              return; }
2018   if (value < (1 << 12)) { sub(reg, reg, value); return; }
2019   /* else */ {
2020     assert(reg != rscratch2, "invalid dst for register decrement");
2021     mov(rscratch2, (unsigned long)value);
2022     sub(reg, reg, rscratch2);
2023   }
2024 }
2025 
2026 void MacroAssembler::decrementw(Address dst, int value)
2027 {
2028   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
2029   if (dst.getMode() == Address::literal) {
2030     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2031     lea(rscratch2, dst);
2032     dst = Address(rscratch2);
2033   }
2034   ldrw(rscratch1, dst);
2035   decrementw(rscratch1, value);
2036   strw(rscratch1, dst);
2037 }
2038 
2039 void MacroAssembler::decrement(Address dst, int value)
2040 {
2041   assert(!dst.uses(rscratch1), "invalid address for decrement");
2042   if (dst.getMode() == Address::literal) {
2043     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2044     lea(rscratch2, dst);
2045     dst = Address(rscratch2);
2046   }
2047   ldr(rscratch1, dst);
2048   decrement(rscratch1, value);
2049   str(rscratch1, dst);
2050 }
2051 
2052 void MacroAssembler::incrementw(Register reg, int value)
2053 {
2054   if (value < 0)  { decrementw(reg, -value);      return; }
2055   if (value == 0) {                               return; }
2056   if (value < (1 << 12)) { addw(reg, reg, value); return; }
2057   /* else */ {
2058     assert(reg != rscratch2, "invalid dst for register increment");
2059     movw(rscratch2, (unsigned)value);
2060     addw(reg, reg, rscratch2);
2061   }
2062 }
2063 
2064 void MacroAssembler::increment(Register reg, int value)
2065 {
2066   if (value < 0)  { decrement(reg, -value);      return; }
2067   if (value == 0) {                              return; }
2068   if (value < (1 << 12)) { add(reg, reg, value); return; }
2069   /* else */ {
2070     assert(reg != rscratch2, "invalid dst for register increment");
2071     movw(rscratch2, (unsigned)value);
2072     add(reg, reg, rscratch2);
2073   }
2074 }
2075 
2076 void MacroAssembler::incrementw(Address dst, int value)
2077 {
2078   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2079   if (dst.getMode() == Address::literal) {
2080     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2081     lea(rscratch2, dst);
2082     dst = Address(rscratch2);
2083   }
2084   ldrw(rscratch1, dst);
2085   incrementw(rscratch1, value);
2086   strw(rscratch1, dst);
2087 }
2088 
2089 void MacroAssembler::increment(Address dst, int value)
2090 {
2091   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2092   if (dst.getMode() == Address::literal) {
2093     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2094     lea(rscratch2, dst);
2095     dst = Address(rscratch2);
2096   }
2097   ldr(rscratch1, dst);
2098   increment(rscratch1, value);
2099   str(rscratch1, dst);
2100 }
2101 
2102 
2103 void MacroAssembler::pusha() {
2104   push(0x7fffffff, sp);
2105 }
2106 
2107 void MacroAssembler::popa() {
2108   pop(0x7fffffff, sp);
2109 }
2110 
2111 // Push lots of registers in the bit set supplied.  Don't push sp.
2112 // Return the number of words pushed
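     // e.g. (illustrative) push(0b0111, sp) pushes {r0, r1, r2}; zr pads
     // the count to an even number, emitting:
     //   stp r0, r1, [sp, #-32]!
     //   stp r2, zr, [sp, #16]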
2113 int MacroAssembler::push(unsigned int bitset, Register stack) {
2114   int words_pushed = 0;
2115 
2116   // Scan bitset to accumulate register pairs
2117   unsigned char regs[32];
2118   int count = 0;
2119   for (int reg = 0; reg <= 30; reg++) {
2120     if (1 & bitset)
2121       regs[count++] = reg;
2122     bitset >>= 1;
2123   }
2124   regs[count++] = zr->encoding_nocheck();
2125   count &= ~1;  // Only push an even number of regs
2126 
2127   if (count) {
2128     stp(as_Register(regs[0]), as_Register(regs[1]),
2129        Address(pre(stack, -count * wordSize)));
2130     words_pushed += 2;
2131   }
2132   for (int i = 2; i < count; i += 2) {
2133     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2134        Address(stack, i * wordSize));
2135     words_pushed += 2;
2136   }
2137 
2138   assert(words_pushed == count, "oops, pushed != count");
2139 
2140   return count;
2141 }
2142 
2143 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2144   int words_pushed = 0;
2145 
2146   // Scan bitset to accumulate register pairs
2147   unsigned char regs[32];
2148   int count = 0;
2149   for (int reg = 0; reg <= 30; reg++) {
2150     if (1 & bitset)
2151       regs[count++] = reg;
2152     bitset >>= 1;
2153   }
2154   regs[count++] = zr->encoding_nocheck();
2155   count &= ~1;
2156 
2157   for (int i = 2; i < count; i += 2) {
2158     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2159        Address(stack, i * wordSize));
2160     words_pushed += 2;
2161   }
2162   if (count) {
2163     ldp(as_Register(regs[0]), as_Register(regs[1]),
2164        Address(post(stack, count * wordSize)));
2165     words_pushed += 2;
2166   }
2167 
2168   assert(words_pushed == count, "oops, pushed != count");
2169 
2170   return count;
2171 }
2172 #ifdef ASSERT
2173 void MacroAssembler::verify_heapbase(const char* msg) {
2174 #if 0
2175   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2176   assert (Universe::heap() != NULL, "java heap should be initialized");
2177   if (CheckCompressedOops) {
2178     Label ok;
2179     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2180     cmpptr(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
2181     br(Assembler::EQ, ok);
2182     stop(msg);
2183     bind(ok);
2184     pop(1 << rscratch1->encoding(), sp);
2185   }
2186 #endif
2187 }
2188 #endif
2189 
2190 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2191   Label done, not_weak;
2192   cbz(value, done);           // Use NULL as-is.
2193 
2194   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
2195   tbz(value, 0, not_weak); // Test for jweak tag.
2196 
2197   // Resolve jweak.
2198   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2199                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2200   verify_oop(value);
2201   b(done);
2202 
2203   bind(not_weak);
2204   // Resolve (untagged) jobject.
2205   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2206   verify_oop(value);
2207   bind(done);
2208 }
2209 
2210 void MacroAssembler::stop(const char* msg) {
2211   address ip = pc();
2212   pusha();
2213   mov(c_rarg0, (address)msg);
2214   mov(c_rarg1, (address)ip);
2215   mov(c_rarg2, sp);
2216   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2217   // call(c_rarg3);
2218   blrt(c_rarg3, 3, 0, 1);
2219   hlt(0);
2220 }
2221 
2222 void MacroAssembler::warn(const char* msg) {
2223   pusha();
2224   mov(c_rarg0, (address)msg);
2225   mov(lr, CAST_FROM_FN_PTR(address, warning));
2226   blrt(lr, 1, 0, MacroAssembler::ret_type_void);
2227   popa();
2228 }
2229 
2230 void MacroAssembler::unimplemented(const char* what) {
2231   const char* buf = NULL;
2232   {
2233     ResourceMark rm;
2234     stringStream ss;
2235     ss.print("unimplemented: %s", what);
2236     buf = code_string(ss.as_string());
2237   }
2238   stop(buf);
2239 }
2240 
2241 // If a constant does not fit in an immediate field, generate some
2242 // number of MOV instructions and then perform the operation.
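     // e.g. (illustrative) add(r0, r1, 0x123456) does not fit a single
     // (optionally shifted) 12-bit immediate but is below 1 << 24, so it
     // is split into:
     //   add r0, r1, #0x123000
     //   add r0, r0, #0x456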
2243 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2244                                            add_sub_imm_insn insn1,
2245                                            add_sub_reg_insn insn2) {
2246   assert(Rd != zr, "Rd = zr and not setting flags?");
2247   if (operand_valid_for_add_sub_immediate((int)imm)) {
2248     (this->*insn1)(Rd, Rn, imm);
2249   } else {
2250     if (uabs(imm) < (1 << 24)) {
2251        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2252        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2253     } else {
2254        assert_different_registers(Rd, Rn);
2255        mov(Rd, (uint64_t)imm);
2256        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2257     }
2258   }
2259 }
2260 
2261 // Separate version which sets the flags. Optimisations are more restricted
2262 // because we must set the flags correctly.
2263 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2264                                            add_sub_imm_insn insn1,
2265                                            add_sub_reg_insn insn2) {
2266   if (operand_valid_for_add_sub_immediate((int)imm)) {
2267     (this->*insn1)(Rd, Rn, imm);
2268   } else {
2269     assert_different_registers(Rd, Rn);
2270     assert(Rd != zr, "overflow in immediate operand");
2271     mov(Rd, (uint64_t)imm);
2272     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2273   }
2274 }
2275 
2276 
2277 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2278   if (increment.is_register()) {
2279     add(Rd, Rn, increment.as_register());
2280   } else {
2281     add(Rd, Rn, increment.as_constant());
2282   }
2283 }
2284 
2285 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2286   if (increment.is_register()) {
2287     addw(Rd, Rn, increment.as_register());
2288   } else {
2289     addw(Rd, Rn, increment.as_constant());
2290   }
2291 }
2292 
2293 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2294   if (decrement.is_register()) {
2295     sub(Rd, Rn, decrement.as_register());
2296   } else {
2297     sub(Rd, Rn, decrement.as_constant());
2298   }
2299 }
2300 
2301 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2302   if (decrement.is_register()) {
2303     subw(Rd, Rn, decrement.as_register());
2304   } else {
2305     subw(Rd, Rn, decrement.as_constant());
2306   }
2307 }
2308 
2309 void MacroAssembler::reinit_heapbase()
2310 {
2311   if (UseCompressedOops) {
2312     if (Universe::is_fully_initialized()) {
2313       mov(rheapbase, CompressedOops::ptrs_base());
2314     } else {
2315       lea(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
2316       ldr(rheapbase, Address(rheapbase));
2317     }
2318   }
2319 }
2320 
2321 // this simulates the behaviour of the x86 cmpxchg instruction using a
2322 // load linked/store conditional pair. we use the acquire/release
2323 // versions of these instructions so that we flush pending writes as
2324 // per Java semantics.
2325 
2326 // n.b. the x86 version assumes the old value to be compared against is
2327 // in rax and updates rax with the value located in memory if the
2328 // cmpxchg fails. we supply a register for the old value explicitly
2329 
2330 // the aarch64 load linked/store conditional instructions do not
2331 // accept an offset. so, unlike x86, we must provide a plain register
2332 // to identify the memory word to be compared/exchanged rather than a
2333 // register+offset Address.
2334 
2335 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2336                                 Label &succeed, Label *fail) {
2337   // oldv holds comparison value
2338   // newv holds value to write in exchange
2339   // addr identifies memory word to compare against/update
2340   if (UseLSE) {
2341     mov(tmp, oldv);
2342     casal(Assembler::xword, oldv, newv, addr);
2343     cmp(tmp, oldv);
2344     br(Assembler::EQ, succeed);
2345     membar(AnyAny);
2346   } else {
2347     Label retry_load, nope;
2348     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2349       prfm(Address(addr), PSTL1STRM);
2350     bind(retry_load);
2351     // flush and load exclusive from the memory location
2352     // and fail if it is not what we expect
2353     ldaxr(tmp, addr);
2354     cmp(tmp, oldv);
2355     br(Assembler::NE, nope);
2356     // if we store+flush with no intervening write tmp will be zero
2357     stlxr(tmp, newv, addr);
2358     cbzw(tmp, succeed);
2359     // retry so we only ever return after a load fails to compare;
2360     // this ensures we don't return a stale value after a failed write.
2361     b(retry_load);
2362     // if the memory word differs we return it in oldv and signal a fail
2363     bind(nope);
2364     membar(AnyAny);
2365     mov(oldv, tmp);
2366   }
2367   if (fail)
2368     b(*fail);
2369 }
2370 
2371 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2372                                         Label &succeed, Label *fail) {
2373   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2374   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2375 }
2376 
2377 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2378                                 Label &succeed, Label *fail) {
2379   // oldv holds comparison value
2380   // newv holds value to write in exchange
2381   // addr identifies memory word to compare against/update
2382   // tmp returns 0/1 for success/failure
2383   if (UseLSE) {
2384     mov(tmp, oldv);
2385     casal(Assembler::word, oldv, newv, addr);
2386     cmp(tmp, oldv);
2387     br(Assembler::EQ, succeed);
2388     membar(AnyAny);
2389   } else {
2390     Label retry_load, nope;
2391     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2392       prfm(Address(addr), PSTL1STRM);
2393     bind(retry_load);
2394     // flush and load exclusive from the memory location
2395     // and fail if it is not what we expect
2396     ldaxrw(tmp, addr);
2397     cmp(tmp, oldv);
2398     br(Assembler::NE, nope);
2399     // if we store+flush with no intervening write tmp will be zero
2400     stlxrw(tmp, newv, addr);
2401     cbzw(tmp, succeed);
2402     // retry so we only ever return after a load fails to compare;
2403     // this ensures we don't return a stale value after a failed write.
2404     b(retry_load);
2405     // if the memory word differs we return it in oldv and signal a fail
2406     bind(nope);
2407     membar(AnyAny);
2408     mov(oldv, tmp);
2409   }
2410   if (fail)
2411     b(*fail);
2412 }
2413 
2414 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2415 // doesn't retry and may fail spuriously.  If the oldval is wanted,
2416 // pass a register for the result; otherwise pass noreg.
2417 
2418 // Clobbers rscratch1
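     // e.g. (illustrative) cmpxchg(r0, r1, r2, Assembler::xword,
     //                             /*acquire*/ true, /*release*/ true,
     //                             /*weak*/ false, noreg)
     // leaves EQ set iff the doubleword at [r0] held r1 and was replaced
     // by r2.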
2419 void MacroAssembler::cmpxchg(Register addr, Register expected,
2420                              Register new_val,
2421                              enum operand_size size,
2422                              bool acquire, bool release,
2423                              bool weak,
2424                              Register result) {
2425   if (result == noreg)  result = rscratch1;
2426   BLOCK_COMMENT("cmpxchg {");
2427   if (UseLSE) {
2428     mov(result, expected);
2429     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2430     compare_eq(result, expected, size);
2431   } else {
2432     Label retry_load, done;
2433     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2434       prfm(Address(addr), PSTL1STRM);
2435     bind(retry_load);
2436     load_exclusive(result, addr, size, acquire);
2437     compare_eq(result, expected, size);
2438     br(Assembler::NE, done);
2439     store_exclusive(rscratch1, new_val, addr, size, release);
2440     if (weak) {
2441       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2442     } else {
2443       cbnzw(rscratch1, retry_load);
2444     }
2445     bind(done);
2446   }
2447   BLOCK_COMMENT("} cmpxchg");
2448 }
2449 
2450 // A generic comparison. Only compares for equality, clobbers rscratch1.
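     // There is no sub-word cmp instruction, so for halfword and byte
     // operands we XOR the registers and test only the low 16 or 8 bits:
     // Z is then set iff those bits are equal.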
2451 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2452   if (size == xword) {
2453     cmp(rm, rn);
2454   } else if (size == word) {
2455     cmpw(rm, rn);
2456   } else if (size == halfword) {
2457     eorw(rscratch1, rm, rn);
2458     ands(zr, rscratch1, 0xffff);
2459   } else if (size == byte) {
2460     eorw(rscratch1, rm, rn);
2461     ands(zr, rscratch1, 0xff);
2462   } else {
2463     ShouldNotReachHere();
2464   }
2465 }
2466 
2467 
2468 static bool different(Register a, RegisterOrConstant b, Register c) {
2469   if (b.is_constant())
2470     return a != c;
2471   else
2472     return a != b.as_register() && a != c && b.as_register() != c;
2473 }
2474 
2475 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2476 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2477   if (UseLSE) {                                                         \
2478     prev = prev->is_valid() ? prev : zr;                                \
2479     if (incr.is_register()) {                                           \
2480       AOP(sz, incr.as_register(), prev, addr);                          \
2481     } else {                                                            \
2482       mov(rscratch2, incr.as_constant());                               \
2483       AOP(sz, rscratch2, prev, addr);                                   \
2484     }                                                                   \
2485     return;                                                             \
2486   }                                                                     \
2487   Register result = rscratch2;                                          \
2488   if (prev->is_valid())                                                 \
2489     result = different(prev, incr, addr) ? prev : rscratch2;            \
2490                                                                         \
2491   Label retry_load;                                                     \
2492   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2493     prfm(Address(addr), PSTL1STRM);                                     \
2494   bind(retry_load);                                                     \
2495   LDXR(result, addr);                                                   \
2496   OP(rscratch1, result, incr);                                          \
2497   STXR(rscratch2, rscratch1, addr);                                     \
2498   cbnzw(rscratch2, retry_load);                                         \
2499   if (prev->is_valid() && prev != result) {                             \
2500     IOP(prev, rscratch1, incr);                                         \
2501   }                                                                     \
2502 }
2503 
2504 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2505 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2506 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2507 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2508 
2509 #undef ATOMIC_OP
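     // e.g. (illustrative) atomic_add(r2, 1, r0) atomically adds 1 to the
     // doubleword at [r0] and returns the previous value in r2: a single
     // ldadd under LSE, otherwise an ldxr/add/stxr retry loop.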
2510 
2511 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2512 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2513   if (UseLSE) {                                                         \
2514     prev = prev->is_valid() ? prev : zr;                                \
2515     AOP(sz, newv, prev, addr);                                          \
2516     return;                                                             \
2517   }                                                                     \
2518   Register result = rscratch2;                                          \
2519   if (prev->is_valid())                                                 \
2520     result = different(prev, newv, addr) ? prev : rscratch2;            \
2521                                                                         \
2522   Label retry_load;                                                     \
2523   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2524     prfm(Address(addr), PSTL1STRM);                                     \
2525   bind(retry_load);                                                     \
2526   LDXR(result, addr);                                                   \
2527   STXR(rscratch1, newv, addr);                                          \
2528   cbnzw(rscratch1, retry_load);                                         \
2529   if (prev->is_valid() && prev != result)                               \
2530     mov(prev, result);                                                  \
2531 }
2532 
2533 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2534 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2535 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2536 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2537 
2538 #undef ATOMIC_XCHG
2539 
2540 #ifndef PRODUCT
2541 extern "C" void findpc(intptr_t x);
2542 #endif
2543 
2544 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2545 {
2546   // In order to get locks to work, we need to fake an in_VM state
2547   if (ShowMessageBoxOnError ) {
2548     JavaThread* thread = JavaThread::current();
2549     JavaThreadState saved_state = thread->thread_state();
2550     thread->set_thread_state(_thread_in_vm);
2551 #ifndef PRODUCT
2552     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2553       ttyLocker ttyl;
2554       BytecodeCounter::print();
2555     }
2556 #endif
2557     if (os::message_box(msg, "Execution stopped, print registers?")) {
2558       ttyLocker ttyl;
2559       tty->print_cr(" pc = 0x%016lx", pc);
2560 #ifndef PRODUCT
2561       tty->cr();
2562       findpc(pc);
2563       tty->cr();
2564 #endif
2565       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2566       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2567       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2568       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2569       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2570       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2571       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2572       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2573       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2574       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2575       tty->print_cr("r10 = 0x%016lx", regs[10]);
2576       tty->print_cr("r11 = 0x%016lx", regs[11]);
2577       tty->print_cr("r12 = 0x%016lx", regs[12]);
2578       tty->print_cr("r13 = 0x%016lx", regs[13]);
2579       tty->print_cr("r14 = 0x%016lx", regs[14]);
2580       tty->print_cr("r15 = 0x%016lx", regs[15]);
2581       tty->print_cr("r16 = 0x%016lx", regs[16]);
2582       tty->print_cr("r17 = 0x%016lx", regs[17]);
2583       tty->print_cr("r18 = 0x%016lx", regs[18]);
2584       tty->print_cr("r19 = 0x%016lx", regs[19]);
2585       tty->print_cr("r20 = 0x%016lx", regs[20]);
2586       tty->print_cr("r21 = 0x%016lx", regs[21]);
2587       tty->print_cr("r22 = 0x%016lx", regs[22]);
2588       tty->print_cr("r23 = 0x%016lx", regs[23]);
2589       tty->print_cr("r24 = 0x%016lx", regs[24]);
2590       tty->print_cr("r25 = 0x%016lx", regs[25]);
2591       tty->print_cr("r26 = 0x%016lx", regs[26]);
2592       tty->print_cr("r27 = 0x%016lx", regs[27]);
2593       tty->print_cr("r28 = 0x%016lx", regs[28]);
2594       tty->print_cr("r30 = 0x%016lx", regs[30]);
2595       tty->print_cr("r31 = 0x%016lx", regs[31]);
2596       BREAKPOINT;
2597     }
2598     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2599   } else {
2600     ttyLocker ttyl;
2601     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2602                     msg);
2603     assert(false, "DEBUG MESSAGE: %s", msg);
2604   }
2605 }
2606 
2607 #ifdef BUILTIN_SIM
2608 // routine to generate an x86 prolog for a stub function which
2609 // bootstraps into the generated ARM code which directly follows the
2610 // stub
2611 //
2612 // the argument encodes the number of general and fp registers
2613 // passed by the caller and the calling convention (currently just
2614 // the number of general registers and assumes C argument passing)
2615 
2616 extern "C" {
2617 int aarch64_stub_prolog_size();
2618 void aarch64_stub_prolog();
2619 void aarch64_prolog();
2620 }
2621 
2622 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2623                                    address *prolog_ptr)
2624 {
2625   int calltype = (((ret_type & 0x3) << 8) |
2626                   ((fp_arg_count & 0xf) << 4) |
2627                   (gp_arg_count & 0xf));
2628 
2629   // the addresses for the x86 to ARM entry code we need to use
2630   address start = pc();
2631   // printf("start = %lx\n", start);
2632   int byteCount =  aarch64_stub_prolog_size();
2633   // printf("byteCount = %x\n", byteCount);
2634   int instructionCount = (byteCount + 3)/ 4;
2635   // printf("instructionCount = %x\n", instructionCount);
2636   for (int i = 0; i < instructionCount; i++) {
2637     nop();
2638   }
2639 
2640   memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2641 
2642   // write the address of the setup routine and the call format at the
2643 // end of the copied code
2644   u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2645   if (prolog_ptr)
2646     patch_end[-2] = (u_int64_t)prolog_ptr;
2647   patch_end[-1] = calltype;
2648 }
2649 #endif
2650 
2651 void MacroAssembler::push_call_clobbered_registers() {
2652   int step = 4 * wordSize;
2653   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2654   sub(sp, sp, step);
2655   mov(rscratch1, -step);
2656   // Push v0-v7, v16-v31.
2657   for (int i = 31; i>= 4; i -= 4) {
2658     if (i <= v7->encoding() || i >= v16->encoding())
2659       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2660           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2661   }
2662   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2663       as_FloatRegister(3), T1D, Address(sp));
2664 }
2665 
2666 void MacroAssembler::pop_call_clobbered_registers() {
2667   for (int i = 0; i < 32; i += 4) {
2668     if (i <= v7->encoding() || i >= v16->encoding())
2669       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2670           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2671   }
2672 
2673   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2674 }
2675 
2676 void MacroAssembler::push_CPU_state(bool save_vectors) {
2677   int step = (save_vectors ? 8 : 4) * wordSize;
2678   push(0x3fffffff, sp);         // integer registers except lr & sp
2679   mov(rscratch1, -step);
2680   sub(sp, sp, step);
2681   for (int i = 28; i >= 4; i -= 4) {
2682     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2683         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2684   }
2685   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2686 }
2687 
2688 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2689   int step = (restore_vectors ? 8 : 4) * wordSize;
2690   for (int i = 0; i <= 28; i += 4)
2691     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2692         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2693   pop(0x3fffffff, sp);         // integer registers except lr & sp
2694 }
2695 
2696 /**
2697  * Helpers for multiply_to_len().
2698  */
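     // Computes, in effect (illustrative notation), the 128-bit sum
     //   final_dest_hi:dest_lo = dest_hi:dest_lo + src1 + src2
     // propagating each carry into the high word with adc.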
2699 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2700                                      Register src1, Register src2) {
2701   adds(dest_lo, dest_lo, src1);
2702   adc(dest_hi, dest_hi, zr);
2703   adds(dest_lo, dest_lo, src2);
2704   adc(final_dest_hi, dest_hi, zr);
2705 }
2706 
2707 // Generate an address from (r + r1 extend offset).  "size" is the
2708 // size of the operand.  The result may be in rscratch2.
2709 Address MacroAssembler::offsetted_address(Register r, Register r1,
2710                                           Address::extend ext, int offset, int size) {
2711   if (offset || (ext.shift() % size != 0)) {
2712     lea(rscratch2, Address(r, r1, ext));
2713     return Address(rscratch2, offset);
2714   } else {
2715     return Address(r, r1, ext);
2716   }
2717 }
2718 
2719 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2720 {
2721   assert(offset >= 0, "spill to negative address?");
2722   // Offset reachable ?
2723   //   Not aligned - 9 bits signed offset
2724   //   Aligned - 12 bits unsigned offset shifted
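       // e.g. (illustrative) an aligned 8-byte spill at offset 0x9010
       // exceeds the scaled 12-bit range, so we peel off the high part:
       //   add tmp, sp, #0x9000
       // and return Address(tmp, 0x10).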
2725   Register base = sp;
2726   if ((offset & (size-1)) && offset >= (1<<8)) {
2727     add(tmp, base, offset & ((1<<12)-1));
2728     base = tmp;
2729     offset &= -1u<<12;
2730   }
2731 
2732   if (offset >= (1<<12) * size) {
2733     add(tmp, base, offset & (((1<<12)-1)<<12));
2734     base = tmp;
2735     offset &= ~(((1<<12)-1)<<12);
2736   }
2737 
2738   return Address(base, offset);
2739 }
2740 
2741 // Checks whether offset is aligned.
2742 // Returns true if it is, else false.
2743 bool MacroAssembler::merge_alignment_check(Register base,
2744                                            size_t size,
2745                                            long cur_offset,
2746                                            long prev_offset) const {
2747   if (AvoidUnalignedAccesses) {
2748     if (base == sp) {
2749       // Checks whether the low offset is aligned to a register pair.
2750       long pair_mask = size * 2 - 1;
2751       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2752       return (offset & pair_mask) == 0;
2753     } else { // If base is not sp, we can't guarantee the access is aligned.
2754       return false;
2755     }
2756   } else {
2757     long mask = size - 1;
2758     // Load/store pair instruction only supports element size aligned offset.
2759     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2760   }
2761 }
2762 
2763 // Checks whether current and previous loads/stores can be merged.
// Returns true if they can be merged, else false.
2765 bool MacroAssembler::ldst_can_merge(Register rt,
2766                                     const Address &adr,
2767                                     size_t cur_size_in_bytes,
2768                                     bool is_store) const {
2769   address prev = pc() - NativeInstruction::instruction_size;
2770   address last = code()->last_insn();
2771 
2772   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2773     return false;
2774   }
2775 
2776   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2777     return false;
2778   }
2779 
2780   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2781   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2782 
  assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only 32/64-bit merging is supported.");
  assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only 32/64-bit merging is supported.");
2785 
2786   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2787     return false;
2788   }
2789 
2790   long max_offset = 63 * prev_size_in_bytes;
2791   long min_offset = -64 * prev_size_in_bytes;
2792 
  assert(prev_ldst->is_not_pre_post_index(), "pre-index and post-index accesses cannot be merged.");
2794 
2795   // Only same base can be merged.
2796   if (adr.base() != prev_ldst->base()) {
2797     return false;
2798   }
2799 
2800   long cur_offset = adr.offset();
2801   long prev_offset = prev_ldst->offset();
2802   size_t diff = abs(cur_offset - prev_offset);
2803   if (diff != prev_size_in_bytes) {
2804     return false;
2805   }
2806 
  // The following cases cannot be merged:
2808   // ldr x2, [x2, #8]
2809   // ldr x3, [x2, #16]
2810   // or:
2811   // ldr x2, [x3, #8]
2812   // ldr x2, [x3, #16]
  // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get a SIGILL.
2814   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2815     return false;
2816   }
2817 
2818   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2819   // Offset range must be in ldp/stp instruction's range.
2820   if (low_offset > max_offset || low_offset < min_offset) {
2821     return false;
2822   }
2823 
2824   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2825     return true;
2826   }
2827 
2828   return false;
2829 }
2830 
2831 // Merge current load/store with previous load/store into ldp/stp.
2832 void MacroAssembler::merge_ldst(Register rt,
2833                                 const Address &adr,
2834                                 size_t cur_size_in_bytes,
2835                                 bool is_store) {
2836 
  assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "cur and prev must be mergeable.");
2838 
2839   Register rt_low, rt_high;
2840   address prev = pc() - NativeInstruction::instruction_size;
2841   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2842 
2843   long offset;
2844 
2845   if (adr.offset() < prev_ldst->offset()) {
2846     offset = adr.offset();
2847     rt_low = rt;
2848     rt_high = prev_ldst->target();
2849   } else {
2850     offset = prev_ldst->offset();
2851     rt_low = prev_ldst->target();
2852     rt_high = rt;
2853   }
2854 
2855   Address adr_p = Address(prev_ldst->base(), offset);
  // Overwrite the previously generated instruction.
2857   code_section()->set_end(prev);
2858 
2859   const int sz = prev_ldst->size_in_bytes();
  assert(sz == 8 || sz == 4, "only 32/64-bit merging is supported.");
2861   if (!is_store) {
2862     BLOCK_COMMENT("merged ldr pair");
2863     if (sz == 8) {
2864       ldp(rt_low, rt_high, adr_p);
2865     } else {
2866       ldpw(rt_low, rt_high, adr_p);
2867     }
2868   } else {
2869     BLOCK_COMMENT("merged str pair");
2870     if (sz == 8) {
2871       stp(rt_low, rt_high, adr_p);
2872     } else {
2873       stpw(rt_low, rt_high, adr_p);
2874     }
2875   }
2876 }
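
// Illustrative merge (a sketch of the transformation): a previously
// emitted
//   ldr x1, [sp, #16]
// followed by a current request for
//   ldr x2, [sp, #24]
// is rewritten, by winding the code-section end back over the previous
// instruction, as
//   ldp x1, x2, [sp, #16]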
2877 
2878 /**
 * Multiply 64 bit by 64 bit: the first loop of multiply_to_len().
2880  */
2881 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2882                                            Register y, Register y_idx, Register z,
2883                                            Register carry, Register product,
2884                                            Register idx, Register kdx) {
2885   //
2886   //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2888   //    huge_128 product = y[idx] * x[xstart] + carry;
2889   //    z[kdx] = (jlong)product;
2890   //    carry  = (jlong)(product >>> 64);
2891   //  }
2892   //  z[xstart] = carry;
2893   //
2894 
2895   Label L_first_loop, L_first_loop_exit;
2896   Label L_one_x, L_one_y, L_multiply;
2897 
2898   subsw(xstart, xstart, 1);
2899   br(Assembler::MI, L_one_x);
2900 
2901   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2902   ldr(x_xstart, Address(rscratch1));
2903   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2904 
2905   bind(L_first_loop);
2906   subsw(idx, idx, 1);
2907   br(Assembler::MI, L_first_loop_exit);
2908   subsw(idx, idx, 1);
2909   br(Assembler::MI, L_one_y);
2910   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2911   ldr(y_idx, Address(rscratch1));
2912   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2913   bind(L_multiply);
2914 
2915   // AArch64 has a multiply-accumulate instruction that we can't use
2916   // here because it has no way to process carries, so we have to use
2917   // separate add and adc instructions.  Bah.
2918   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2919   mul(product, x_xstart, y_idx);
2920   adds(product, product, carry);
2921   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2922 
2923   subw(kdx, kdx, 2);
2924   ror(product, product, 32); // back to big-endian
2925   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2926 
2927   b(L_first_loop);
2928 
2929   bind(L_one_y);
2930   ldrw(y_idx, Address(y,  0));
2931   b(L_multiply);
2932 
2933   bind(L_one_x);
2934   ldrw(x_xstart, Address(x,  0));
2935   b(L_first_loop);
2936 
2937   bind(L_first_loop_exit);
2938 }
2939 
2940 /**
 * Multiply 128 bit by 128 bit. Unrolled inner loop.
2943  */
2944 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2945                                              Register carry, Register carry2,
2946                                              Register idx, Register jdx,
2947                                              Register yz_idx1, Register yz_idx2,
2948                                              Register tmp, Register tmp3, Register tmp4,
2949                                              Register tmp6, Register product_hi) {
2950 
2951   //   jlong carry, x[], y[], z[];
2952   //   int kdx = ystart+1;
2953   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2954   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2955   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2956   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2957   //     carry  = (jlong)(tmp4 >>> 64);
2958   //     z[kdx+idx+1] = (jlong)tmp3;
2959   //     z[kdx+idx] = (jlong)tmp4;
2960   //   }
2961   //   idx += 2;
2962   //   if (idx > 0) {
2963   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2964   //     z[kdx+idx] = (jlong)yz_idx1;
2965   //     carry  = (jlong)(yz_idx1 >>> 64);
2966   //   }
2967   //
2968 
2969   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2970 
2971   lsrw(jdx, idx, 2);
2972 
2973   bind(L_third_loop);
2974 
2975   subsw(jdx, jdx, 1);
2976   br(Assembler::MI, L_third_loop_exit);
2977   subw(idx, idx, 4);
2978 
2979   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2980 
2981   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2982 
2983   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2984 
2985   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2986   ror(yz_idx2, yz_idx2, 32);
2987 
2988   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2989 
2990   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2991   umulh(tmp4, product_hi, yz_idx1);
2992 
2993   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2994   ror(rscratch2, rscratch2, 32);
2995 
2996   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2997   umulh(carry2, product_hi, yz_idx2);
2998 
2999   // propagate sum of both multiplications into carry:tmp4:tmp3
3000   adds(tmp3, tmp3, carry);
3001   adc(tmp4, tmp4, zr);
3002   adds(tmp3, tmp3, rscratch1);
3003   adcs(tmp4, tmp4, tmp);
3004   adc(carry, carry2, zr);
3005   adds(tmp4, tmp4, rscratch2);
3006   adc(carry, carry, zr);
3007 
3008   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
3009   ror(tmp4, tmp4, 32);
3010   stp(tmp4, tmp3, Address(tmp6, 0));
3011 
3012   b(L_third_loop);
3013   bind (L_third_loop_exit);
3014 
3015   andw (idx, idx, 0x3);
3016   cbz(idx, L_post_third_loop_done);
3017 
3018   Label L_check_1;
3019   subsw(idx, idx, 2);
3020   br(Assembler::MI, L_check_1);
3021 
3022   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3023   ldr(yz_idx1, Address(rscratch1, 0));
3024   ror(yz_idx1, yz_idx1, 32);
3025   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
3026   umulh(tmp4, product_hi, yz_idx1);
3027   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3028   ldr(yz_idx2, Address(rscratch1, 0));
3029   ror(yz_idx2, yz_idx2, 32);
3030 
3031   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
3032 
3033   ror(tmp3, tmp3, 32);
3034   str(tmp3, Address(rscratch1, 0));
3035 
3036   bind (L_check_1);
3037 
3038   andw (idx, idx, 0x1);
3039   subsw(idx, idx, 1);
3040   br(Assembler::MI, L_post_third_loop_done);
3041   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3042   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
3043   umulh(carry2, tmp4, product_hi);
3044   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3045 
  add2_with_carry(carry2, carry2, tmp3, tmp4, carry);
3047 
3048   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3049   extr(carry, carry2, tmp3, 32);
3050 
3051   bind(L_post_third_loop_done);
3052 }
3053 
3054 /**
 * Code for BigInteger::multiplyToLen() intrinsic.
3056  *
3057  * r0: x
3058  * r1: xlen
3059  * r2: y
3060  * r3: ylen
 * r4: z
3062  * r5: zlen
3063  * r10: tmp1
3064  * r11: tmp2
3065  * r12: tmp3
3066  * r13: tmp4
3067  * r14: tmp5
3068  * r15: tmp6
3069  * r16: tmp7
3070  *
3071  */
3072 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3073                                      Register z, Register zlen,
3074                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3075                                      Register tmp5, Register tmp6, Register product_hi) {
3076 
3077   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3078 
3079   const Register idx = tmp1;
3080   const Register kdx = tmp2;
3081   const Register xstart = tmp3;
3082 
3083   const Register y_idx = tmp4;
3084   const Register carry = tmp5;
3085   const Register product  = xlen;
3086   const Register x_xstart = zlen;  // reuse register
3087 
3088   // First Loop.
3089   //
3090   //  final static long LONG_MASK = 0xffffffffL;
3091   //  int xstart = xlen - 1;
3092   //  int ystart = ylen - 1;
3093   //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3095   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3096   //    z[kdx] = (int)product;
3097   //    carry = product >>> 32;
3098   //  }
3099   //  z[xstart] = (int)carry;
3100   //
3101 
3102   movw(idx, ylen);      // idx = ylen;
3103   movw(kdx, zlen);      // kdx = xlen+ylen;
3104   mov(carry, zr);       // carry = 0;
3105 
3106   Label L_done;
3107 
3108   movw(xstart, xlen);
3109   subsw(xstart, xstart, 1);
3110   br(Assembler::MI, L_done);
3111 
3112   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3113 
3114   Label L_second_loop;
3115   cbzw(kdx, L_second_loop);
3116 
3117   Label L_carry;
3118   subw(kdx, kdx, 1);
3119   cbzw(kdx, L_carry);
3120 
3121   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3122   lsr(carry, carry, 32);
3123   subw(kdx, kdx, 1);
3124 
3125   bind(L_carry);
3126   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3127 
3128   // Second and third (nested) loops.
3129   //
3130   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3131   //   carry = 0;
3132   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3133   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3134   //                    (z[k] & LONG_MASK) + carry;
3135   //     z[k] = (int)product;
3136   //     carry = product >>> 32;
3137   //   }
3138   //   z[i] = (int)carry;
3139   // }
3140   //
3141   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3142 
3143   const Register jdx = tmp1;
3144 
3145   bind(L_second_loop);
3146   mov(carry, zr);                // carry = 0;
3147   movw(jdx, ylen);               // j = ystart+1
3148 
3149   subsw(xstart, xstart, 1);      // i = xstart-1;
3150   br(Assembler::MI, L_done);
3151 
3152   str(z, Address(pre(sp, -4 * wordSize)));
3153 
3154   Label L_last_x;
3155   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3156   subsw(xstart, xstart, 1);       // i = xstart-1;
3157   br(Assembler::MI, L_last_x);
3158 
3159   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3160   ldr(product_hi, Address(rscratch1));
3161   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3162 
3163   Label L_third_loop_prologue;
3164   bind(L_third_loop_prologue);
3165 
3166   str(ylen, Address(sp, wordSize));
3167   stp(x, xstart, Address(sp, 2 * wordSize));
3168   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3169                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3170   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3171   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3172 
3173   addw(tmp3, xlen, 1);
3174   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3175   subsw(tmp3, tmp3, 1);
3176   br(Assembler::MI, L_done);
3177 
3178   lsr(carry, carry, 32);
3179   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3180   b(L_second_loop);
3181 
  // The following infrequent code is moved outside the loops.
3183   bind(L_last_x);
3184   ldrw(product_hi, Address(x,  0));
3185   b(L_third_loop_prologue);
3186 
3187   bind(L_done);
3188 }
3189 
// Code for BigInteger::mulAdd intrinsic
3191 // out     = r0
3192 // in      = r1
3193 // offset  = r2  (already out.length-offset)
3194 // len     = r3
3195 // k       = r4
3196 //
// Pseudo code from the Java implementation:
3198 // carry = 0;
3199 // offset = out.length-offset - 1;
3200 // for (int j=len-1; j >= 0; j--) {
3201 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3202 //     out[offset--] = (int)product;
3203 //     carry = product >>> 32;
3204 // }
3205 // return (int)carry;
3206 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3207       Register len, Register k) {
3208     Label LOOP, END;
3209     // pre-loop
    cmp(len, zr); // cmp, not cbz/cbnz: the condition is used twice => fewer branches
3211     csel(out, zr, out, Assembler::EQ);
3212     br(Assembler::EQ, END);
3213     add(in, in, len, LSL, 2); // in[j+1] address
3214     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3215     mov(out, zr); // used to keep carry now
3216     BIND(LOOP);
3217     ldrw(rscratch1, Address(pre(in, -4)));
3218     madd(rscratch1, rscratch1, k, out);
3219     ldrw(rscratch2, Address(pre(offset, -4)));
3220     add(rscratch1, rscratch1, rscratch2);
3221     strw(rscratch1, Address(offset));
3222     lsr(out, rscratch1, 32);
3223     subs(len, len, 1);
3224     br(Assembler::NE, LOOP);
3225     BIND(END);
3226 }
3227 
3228 /**
3229  * Emits code to update CRC-32 with a byte value according to constants in table
3230  *
3231  * @param [in,out]crc   Register containing the crc.
3232  * @param [in]val       Register containing the byte to fold into the CRC.
3233  * @param [in]table     Register containing the table of crc constants.
3234  *
3235  * uint32_t crc;
3236  * val = crc_table[(val ^ crc) & 0xFF];
3237  * crc = val ^ (crc >> 8);
3238  *
3239  */
3240 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3241   eor(val, val, crc);
3242   andr(val, val, 0xff);
3243   ldrw(val, Address(table, val, Address::lsl(2)));
3244   eor(crc, val, crc, Assembler::LSR, 8);
3245 }
3246 
3247 /**
3248  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3249  *
3250  * @param [in,out]crc   Register containing the crc.
3251  * @param [in]v         Register containing the 32-bit to fold into the CRC.
3252  * @param [in]table0    Register containing table 0 of crc constants.
3253  * @param [in]table1    Register containing table 1 of crc constants.
3254  * @param [in]table2    Register containing table 2 of crc constants.
3255  * @param [in]table3    Register containing table 3 of crc constants.
3256  *
3257  * uint32_t crc;
3258  *   v = crc ^ v
3259  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3260  *
3261  */
3262 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3263         Register table0, Register table1, Register table2, Register table3,
3264         bool upper) {
3265   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3266   uxtb(tmp, v);
3267   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3268   ubfx(tmp, v, 8, 8);
3269   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3270   eor(crc, crc, tmp);
3271   ubfx(tmp, v, 16, 8);
3272   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3273   eor(crc, crc, tmp);
3274   ubfx(tmp, v, 24, 8);
3275   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3276   eor(crc, crc, tmp);
3277 }
3278 
3279 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3280         Register len, Register tmp0, Register tmp1, Register tmp2,
3281         Register tmp3) {
3282     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3283     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3284 
3285     mvnw(crc, crc);
3286 
3287     subs(len, len, 128);
3288     br(Assembler::GE, CRC_by64_pre);
3289   BIND(CRC_less64);
3290     adds(len, len, 128-32);
3291     br(Assembler::GE, CRC_by32_loop);
3292   BIND(CRC_less32);
3293     adds(len, len, 32-4);
3294     br(Assembler::GE, CRC_by4_loop);
3295     adds(len, len, 4);
3296     br(Assembler::GT, CRC_by1_loop);
3297     b(L_exit);
3298 
3299   BIND(CRC_by32_loop);
3300     ldp(tmp0, tmp1, Address(post(buf, 16)));
3301     subs(len, len, 32);
3302     crc32x(crc, crc, tmp0);
3303     ldr(tmp2, Address(post(buf, 8)));
3304     crc32x(crc, crc, tmp1);
3305     ldr(tmp3, Address(post(buf, 8)));
3306     crc32x(crc, crc, tmp2);
3307     crc32x(crc, crc, tmp3);
3308     br(Assembler::GE, CRC_by32_loop);
3309     cmn(len, 32);
3310     br(Assembler::NE, CRC_less32);
3311     b(L_exit);
3312 
3313   BIND(CRC_by4_loop);
3314     ldrw(tmp0, Address(post(buf, 4)));
3315     subs(len, len, 4);
3316     crc32w(crc, crc, tmp0);
3317     br(Assembler::GE, CRC_by4_loop);
3318     adds(len, len, 4);
3319     br(Assembler::LE, L_exit);
3320   BIND(CRC_by1_loop);
3321     ldrb(tmp0, Address(post(buf, 1)));
3322     subs(len, len, 1);
3323     crc32b(crc, crc, tmp0);
3324     br(Assembler::GT, CRC_by1_loop);
3325     b(L_exit);
3326 
3327   BIND(CRC_by64_pre);
3328     sub(buf, buf, 8);
3329     ldp(tmp0, tmp1, Address(buf, 8));
3330     crc32x(crc, crc, tmp0);
3331     ldr(tmp2, Address(buf, 24));
3332     crc32x(crc, crc, tmp1);
3333     ldr(tmp3, Address(buf, 32));
3334     crc32x(crc, crc, tmp2);
3335     ldr(tmp0, Address(buf, 40));
3336     crc32x(crc, crc, tmp3);
3337     ldr(tmp1, Address(buf, 48));
3338     crc32x(crc, crc, tmp0);
3339     ldr(tmp2, Address(buf, 56));
3340     crc32x(crc, crc, tmp1);
3341     ldr(tmp3, Address(pre(buf, 64)));
3342 
3343     b(CRC_by64_loop);
3344 
3345     align(CodeEntryAlignment);
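    // The by-64 loop below interleaves each crc32x with the load for a
    // later iteration, so load latency is hidden behind the CRC work.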
3346   BIND(CRC_by64_loop);
3347     subs(len, len, 64);
3348     crc32x(crc, crc, tmp2);
3349     ldr(tmp0, Address(buf, 8));
3350     crc32x(crc, crc, tmp3);
3351     ldr(tmp1, Address(buf, 16));
3352     crc32x(crc, crc, tmp0);
3353     ldr(tmp2, Address(buf, 24));
3354     crc32x(crc, crc, tmp1);
3355     ldr(tmp3, Address(buf, 32));
3356     crc32x(crc, crc, tmp2);
3357     ldr(tmp0, Address(buf, 40));
3358     crc32x(crc, crc, tmp3);
3359     ldr(tmp1, Address(buf, 48));
3360     crc32x(crc, crc, tmp0);
3361     ldr(tmp2, Address(buf, 56));
3362     crc32x(crc, crc, tmp1);
3363     ldr(tmp3, Address(pre(buf, 64)));
3364     br(Assembler::GE, CRC_by64_loop);
3365 
3366     // post-loop
3367     crc32x(crc, crc, tmp2);
3368     crc32x(crc, crc, tmp3);
3369 
3370     sub(len, len, 64);
3371     add(buf, buf, 8);
3372     cmn(len, 128);
3373     br(Assembler::NE, CRC_less64);
3374   BIND(L_exit);
3375     mvnw(crc, crc);
3376 }
3377 
3378 /**
3379  * @param crc   register containing existing CRC (32-bit)
3380  * @param buf   register pointing to input byte buffer (byte*)
3381  * @param len   register containing number of bytes
 * @param table0..table3  registers that will contain the addresses of CRC tables 0..3
 * @param tmp, tmp2, tmp3 scratch registers
3384  */
3385 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3386         Register table0, Register table1, Register table2, Register table3,
3387         Register tmp, Register tmp2, Register tmp3) {
3388   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3389   unsigned long offset;
3390 
3391   if (UseCRC32) {
3392       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3393       return;
3394   }
3395 
3396     mvnw(crc, crc);
3397 
3398     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3399     if (offset) add(table0, table0, offset);
3400     add(table1, table0, 1*256*sizeof(juint));
3401     add(table2, table0, 2*256*sizeof(juint));
3402     add(table3, table0, 3*256*sizeof(juint));
3403 
3404   if (UseNeon) {
3405       cmp(len, (u1)64);
3406       br(Assembler::LT, L_by16);
3407       eor(v16, T16B, v16, v16);
3408 
3409     Label L_fold;
3410 
3411       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3412 
3413       ld1(v0, v1, T2D, post(buf, 32));
3414       ld1r(v4, T2D, post(tmp, 8));
3415       ld1r(v5, T2D, post(tmp, 8));
3416       ld1r(v6, T2D, post(tmp, 8));
3417       ld1r(v7, T2D, post(tmp, 8));
3418       mov(v16, T4S, 0, crc);
3419 
3420       eor(v0, T16B, v0, v16);
3421       sub(len, len, 64);
3422 
3423     BIND(L_fold);
3424       pmull(v22, T8H, v0, v5, T8B);
3425       pmull(v20, T8H, v0, v7, T8B);
3426       pmull(v23, T8H, v0, v4, T8B);
3427       pmull(v21, T8H, v0, v6, T8B);
3428 
3429       pmull2(v18, T8H, v0, v5, T16B);
3430       pmull2(v16, T8H, v0, v7, T16B);
3431       pmull2(v19, T8H, v0, v4, T16B);
3432       pmull2(v17, T8H, v0, v6, T16B);
3433 
3434       uzp1(v24, T8H, v20, v22);
3435       uzp2(v25, T8H, v20, v22);
3436       eor(v20, T16B, v24, v25);
3437 
3438       uzp1(v26, T8H, v16, v18);
3439       uzp2(v27, T8H, v16, v18);
3440       eor(v16, T16B, v26, v27);
3441 
3442       ushll2(v22, T4S, v20, T8H, 8);
3443       ushll(v20, T4S, v20, T4H, 8);
3444 
3445       ushll2(v18, T4S, v16, T8H, 8);
3446       ushll(v16, T4S, v16, T4H, 8);
3447 
3448       eor(v22, T16B, v23, v22);
3449       eor(v18, T16B, v19, v18);
3450       eor(v20, T16B, v21, v20);
3451       eor(v16, T16B, v17, v16);
3452 
3453       uzp1(v17, T2D, v16, v20);
3454       uzp2(v21, T2D, v16, v20);
3455       eor(v17, T16B, v17, v21);
3456 
3457       ushll2(v20, T2D, v17, T4S, 16);
3458       ushll(v16, T2D, v17, T2S, 16);
3459 
3460       eor(v20, T16B, v20, v22);
3461       eor(v16, T16B, v16, v18);
3462 
3463       uzp1(v17, T2D, v20, v16);
3464       uzp2(v21, T2D, v20, v16);
3465       eor(v28, T16B, v17, v21);
3466 
3467       pmull(v22, T8H, v1, v5, T8B);
3468       pmull(v20, T8H, v1, v7, T8B);
3469       pmull(v23, T8H, v1, v4, T8B);
3470       pmull(v21, T8H, v1, v6, T8B);
3471 
3472       pmull2(v18, T8H, v1, v5, T16B);
3473       pmull2(v16, T8H, v1, v7, T16B);
3474       pmull2(v19, T8H, v1, v4, T16B);
3475       pmull2(v17, T8H, v1, v6, T16B);
3476 
3477       ld1(v0, v1, T2D, post(buf, 32));
3478 
3479       uzp1(v24, T8H, v20, v22);
3480       uzp2(v25, T8H, v20, v22);
3481       eor(v20, T16B, v24, v25);
3482 
3483       uzp1(v26, T8H, v16, v18);
3484       uzp2(v27, T8H, v16, v18);
3485       eor(v16, T16B, v26, v27);
3486 
3487       ushll2(v22, T4S, v20, T8H, 8);
3488       ushll(v20, T4S, v20, T4H, 8);
3489 
3490       ushll2(v18, T4S, v16, T8H, 8);
3491       ushll(v16, T4S, v16, T4H, 8);
3492 
3493       eor(v22, T16B, v23, v22);
3494       eor(v18, T16B, v19, v18);
3495       eor(v20, T16B, v21, v20);
3496       eor(v16, T16B, v17, v16);
3497 
3498       uzp1(v17, T2D, v16, v20);
3499       uzp2(v21, T2D, v16, v20);
3500       eor(v16, T16B, v17, v21);
3501 
3502       ushll2(v20, T2D, v16, T4S, 16);
3503       ushll(v16, T2D, v16, T2S, 16);
3504 
3505       eor(v20, T16B, v22, v20);
3506       eor(v16, T16B, v16, v18);
3507 
3508       uzp1(v17, T2D, v20, v16);
3509       uzp2(v21, T2D, v20, v16);
3510       eor(v20, T16B, v17, v21);
3511 
3512       shl(v16, T2D, v28, 1);
3513       shl(v17, T2D, v20, 1);
3514 
3515       eor(v0, T16B, v0, v16);
3516       eor(v1, T16B, v1, v17);
3517 
3518       subs(len, len, 32);
3519       br(Assembler::GE, L_fold);
3520 
3521       mov(crc, 0);
3522       mov(tmp, v0, T1D, 0);
3523       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3524       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3525       mov(tmp, v0, T1D, 1);
3526       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3527       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3528       mov(tmp, v1, T1D, 0);
3529       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3530       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3531       mov(tmp, v1, T1D, 1);
3532       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3533       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3534 
3535       add(len, len, 32);
3536   }
3537 
3538   BIND(L_by16);
3539     subs(len, len, 16);
3540     br(Assembler::GE, L_by16_loop);
3541     adds(len, len, 16-4);
3542     br(Assembler::GE, L_by4_loop);
3543     adds(len, len, 4);
3544     br(Assembler::GT, L_by1_loop);
3545     b(L_exit);
3546 
3547   BIND(L_by4_loop);
3548     ldrw(tmp, Address(post(buf, 4)));
3549     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3550     subs(len, len, 4);
3551     br(Assembler::GE, L_by4_loop);
3552     adds(len, len, 4);
3553     br(Assembler::LE, L_exit);
3554   BIND(L_by1_loop);
3555     subs(len, len, 1);
3556     ldrb(tmp, Address(post(buf, 1)));
3557     update_byte_crc32(crc, tmp, table0);
3558     br(Assembler::GT, L_by1_loop);
3559     b(L_exit);
3560 
3561     align(CodeEntryAlignment);
3562   BIND(L_by16_loop);
3563     subs(len, len, 16);
3564     ldp(tmp, tmp3, Address(post(buf, 16)));
3565     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3566     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3567     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3568     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3569     br(Assembler::GE, L_by16_loop);
3570     adds(len, len, 16-4);
3571     br(Assembler::GE, L_by4_loop);
3572     adds(len, len, 4);
3573     br(Assembler::GT, L_by1_loop);
3574   BIND(L_exit);
3575     mvnw(crc, crc);
3576 }
3577 
3578 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3579         Register len, Register tmp0, Register tmp1, Register tmp2,
3580         Register tmp3) {
3581     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3582     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3583 
3584     subs(len, len, 128);
3585     br(Assembler::GE, CRC_by64_pre);
3586   BIND(CRC_less64);
3587     adds(len, len, 128-32);
3588     br(Assembler::GE, CRC_by32_loop);
3589   BIND(CRC_less32);
3590     adds(len, len, 32-4);
3591     br(Assembler::GE, CRC_by4_loop);
3592     adds(len, len, 4);
3593     br(Assembler::GT, CRC_by1_loop);
3594     b(L_exit);
3595 
3596   BIND(CRC_by32_loop);
3597     ldp(tmp0, tmp1, Address(post(buf, 16)));
3598     subs(len, len, 32);
3599     crc32cx(crc, crc, tmp0);
3600     ldr(tmp2, Address(post(buf, 8)));
3601     crc32cx(crc, crc, tmp1);
3602     ldr(tmp3, Address(post(buf, 8)));
3603     crc32cx(crc, crc, tmp2);
3604     crc32cx(crc, crc, tmp3);
3605     br(Assembler::GE, CRC_by32_loop);
3606     cmn(len, 32);
3607     br(Assembler::NE, CRC_less32);
3608     b(L_exit);
3609 
3610   BIND(CRC_by4_loop);
3611     ldrw(tmp0, Address(post(buf, 4)));
3612     subs(len, len, 4);
3613     crc32cw(crc, crc, tmp0);
3614     br(Assembler::GE, CRC_by4_loop);
3615     adds(len, len, 4);
3616     br(Assembler::LE, L_exit);
3617   BIND(CRC_by1_loop);
3618     ldrb(tmp0, Address(post(buf, 1)));
3619     subs(len, len, 1);
3620     crc32cb(crc, crc, tmp0);
3621     br(Assembler::GT, CRC_by1_loop);
3622     b(L_exit);
3623 
3624   BIND(CRC_by64_pre);
3625     sub(buf, buf, 8);
3626     ldp(tmp0, tmp1, Address(buf, 8));
3627     crc32cx(crc, crc, tmp0);
3628     ldr(tmp2, Address(buf, 24));
3629     crc32cx(crc, crc, tmp1);
3630     ldr(tmp3, Address(buf, 32));
3631     crc32cx(crc, crc, tmp2);
3632     ldr(tmp0, Address(buf, 40));
3633     crc32cx(crc, crc, tmp3);
3634     ldr(tmp1, Address(buf, 48));
3635     crc32cx(crc, crc, tmp0);
3636     ldr(tmp2, Address(buf, 56));
3637     crc32cx(crc, crc, tmp1);
3638     ldr(tmp3, Address(pre(buf, 64)));
3639 
3640     b(CRC_by64_loop);
3641 
3642     align(CodeEntryAlignment);
3643   BIND(CRC_by64_loop);
3644     subs(len, len, 64);
3645     crc32cx(crc, crc, tmp2);
3646     ldr(tmp0, Address(buf, 8));
3647     crc32cx(crc, crc, tmp3);
3648     ldr(tmp1, Address(buf, 16));
3649     crc32cx(crc, crc, tmp0);
3650     ldr(tmp2, Address(buf, 24));
3651     crc32cx(crc, crc, tmp1);
3652     ldr(tmp3, Address(buf, 32));
3653     crc32cx(crc, crc, tmp2);
3654     ldr(tmp0, Address(buf, 40));
3655     crc32cx(crc, crc, tmp3);
3656     ldr(tmp1, Address(buf, 48));
3657     crc32cx(crc, crc, tmp0);
3658     ldr(tmp2, Address(buf, 56));
3659     crc32cx(crc, crc, tmp1);
3660     ldr(tmp3, Address(pre(buf, 64)));
3661     br(Assembler::GE, CRC_by64_loop);
3662 
3663     // post-loop
3664     crc32cx(crc, crc, tmp2);
3665     crc32cx(crc, crc, tmp3);
3666 
3667     sub(len, len, 64);
3668     add(buf, buf, 8);
3669     cmn(len, 128);
3670     br(Assembler::NE, CRC_less64);
3671   BIND(L_exit);
3672 }
3673 
3674 /**
3675  * @param crc   register containing existing CRC (32-bit)
3676  * @param buf   register pointing to input byte buffer (byte*)
3677  * @param len   register containing number of bytes
 * @param table0..table3  scratch registers; the hardware crc32c instructions
 *                        are always used here, so no table address is loaded
 * @param tmp, tmp2, tmp3 unused
3680  */
3681 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3682         Register table0, Register table1, Register table2, Register table3,
3683         Register tmp, Register tmp2, Register tmp3) {
3684   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3685 }
3686 
3687 
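// SkipIfEqual tests *flag_addr at its construction site and binds the
// skip target in its destructor, so code emitted while the object is
// live is skipped at runtime when the flag byte is zero.  (Note that
// `value` is not consulted by this implementation.)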
3688 SkipIfEqual::SkipIfEqual(
3689     MacroAssembler* masm, const bool* flag_addr, bool value) {
3690   _masm = masm;
3691   unsigned long offset;
3692   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3693   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3694   _masm->cbzw(rscratch1, _label);
3695 }
3696 
3697 SkipIfEqual::~SkipIfEqual() {
3698   _masm->bind(_label);
3699 }
3700 
3701 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3702   Address adr;
3703   switch(dst.getMode()) {
3704   case Address::base_plus_offset:
3705     // This is the expected mode, although we allow all the other
3706     // forms below.
3707     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3708     break;
3709   default:
3710     lea(rscratch2, dst);
3711     adr = Address(rscratch2);
3712     break;
3713   }
3714   ldr(rscratch1, adr);
3715   add(rscratch1, rscratch1, src);
3716   str(rscratch1, adr);
3717 }
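
// Illustrative (hypothetical) use: addptr(Address(rthread, off), 1)
// performs a non-atomic load/add/store of the word at rthread + off,
// clobbering rscratch1 and rscratch2.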
3718 
3719 void MacroAssembler::cmpptr(Register src1, Address src2) {
3720   unsigned long offset;
3721   adrp(rscratch1, src2, offset);
3722   ldr(rscratch1, Address(rscratch1, offset));
3723   cmp(src1, rscratch1);
3724 }
3725 
3726 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3727   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3728   bs->obj_equals(this, obj1, obj2);
3729 }
3730 
3731 void MacroAssembler::load_metadata(Register dst, Register src) {
3732   if (UseCompressedClassPointers) {
3733     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3734   } else {
3735     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3736   }
3737 }
3738 
3739 void MacroAssembler::load_klass(Register dst, Register src) {
3740   load_metadata(dst, src);
3741   if (UseCompressedClassPointers) {
3742     andr(dst, dst, oopDesc::compressed_klass_mask());
3743     decode_klass_not_null(dst);
3744   } else {
3745     ubfm(dst, dst, 0, 63 - oopDesc::storage_props_nof_bits);
3746   }
3747 }
3748 
3749 // ((OopHandle)result).resolve();
3750 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3751   // OopHandle::resolve is an indirection.
3752   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3753 }
3754 
3755 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3756   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3757   ldr(dst, Address(rmethod, Method::const_offset()));
3758   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3759   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3760   ldr(dst, Address(dst, mirror_offset));
3761   resolve_oop_handle(dst, tmp);
3762 }
3763 
3764 void MacroAssembler::load_storage_props(Register dst, Register src) {
3765   load_metadata(dst, src);
3766   if (UseCompressedClassPointers) {
3767     asrw(dst, dst, oopDesc::narrow_storage_props_shift);
3768   } else {
3769     asr(dst, dst, oopDesc::wide_storage_props_shift);
3770   }
3771 }
3772 
3773 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3774   if (UseCompressedClassPointers) {
3775     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3776     if (CompressedKlassPointers::base() == NULL) {
3777       cmp(trial_klass, tmp, LSL, CompressedKlassPointers::shift());
3778       return;
3779     } else if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
3780                && CompressedKlassPointers::shift() == 0) {
3781       // Only the bottom 32 bits matter
3782       cmpw(trial_klass, tmp);
3783       return;
3784     }
3785     decode_klass_not_null(tmp);
3786   } else {
3787     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3788   }
3789   cmp(trial_klass, tmp);
3790 }
3791 
3792 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3793   load_klass(dst, src);
3794   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3795 }
3796 
3797 void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release?  Concurrent GCs assume the
  // klass length is valid if the klass field is not null.
3800   if (UseCompressedClassPointers) {
3801     encode_klass_not_null(src);
3802     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3803   } else {
3804     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3805   }
3806 }
3807 
3808 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3809   if (UseCompressedClassPointers) {
3810     // Store to klass gap in destination
3811     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3812   }
3813 }
3814 
3815 // Algorithm must match CompressedOops::encode.
3816 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3817 #ifdef ASSERT
3818   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3819 #endif
3820   verify_oop(s, "broken oop in encode_heap_oop");
3821   if (CompressedOops::base() == NULL) {
3822     if (CompressedOops::shift() != 0) {
3823       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3824       lsr(d, s, LogMinObjAlignmentInBytes);
3825     } else {
3826       mov(d, s);
3827     }
3828   } else {
3829     subs(d, s, rheapbase);
3830     csel(d, d, zr, Assembler::HS);
3831     lsr(d, d, LogMinObjAlignmentInBytes);
3832 
3833     /*  Old algorithm: is this any worse?
3834     Label nonnull;
3835     cbnz(r, nonnull);
3836     sub(r, r, rheapbase);
3837     bind(nonnull);
3838     lsr(r, r, LogMinObjAlignmentInBytes);
3839     */
3840   }
3841 }
3842 
3843 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3844 #ifdef ASSERT
3845   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3846   if (CheckCompressedOops) {
3847     Label ok;
3848     cbnz(r, ok);
3849     stop("null oop passed to encode_heap_oop_not_null");
3850     bind(ok);
3851   }
3852 #endif
3853   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3854   if (CompressedOops::base() != NULL) {
3855     sub(r, r, rheapbase);
3856   }
3857   if (CompressedOops::shift() != 0) {
3858     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3859     lsr(r, r, LogMinObjAlignmentInBytes);
3860   }
3861 }
3862 
3863 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3864 #ifdef ASSERT
3865   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3866   if (CheckCompressedOops) {
3867     Label ok;
3868     cbnz(src, ok);
3869     stop("null oop passed to encode_heap_oop_not_null2");
3870     bind(ok);
3871   }
3872 #endif
3873   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3874 
3875   Register data = src;
3876   if (CompressedOops::base() != NULL) {
3877     sub(dst, src, rheapbase);
3878     data = dst;
3879   }
3880   if (CompressedOops::shift() != 0) {
3881     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3882     lsr(dst, data, LogMinObjAlignmentInBytes);
3883     data = dst;
3884   }
3885   if (data == src)
3886     mov(dst, src);
3887 }
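
// In effect (an illustrative sketch): with a non-null heap base and a
// non-zero shift, the sequence above computes
//   dst = (src - rheapbase) >> LogMinObjAlignmentInBytes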
3888 
3889 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3890 #ifdef ASSERT
3891   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3892 #endif
3893   if (CompressedOops::base() == NULL) {
3894     if (CompressedOops::shift() != 0 || d != s) {
3895       lsl(d, s, CompressedOops::shift());
3896     }
3897   } else {
3898     Label done;
3899     if (d != s)
3900       mov(d, s);
3901     cbz(s, done);
3902     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3903     bind(done);
3904   }
3905   verify_oop(d, "broken oop in decode_heap_oop");
3906 }
3907 
3908 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3909   assert (UseCompressedOops, "should only be used for compressed headers");
3910   assert (Universe::heap() != NULL, "java heap should be initialized");
3911   // Cannot assert, unverified entry point counts instructions (see .ad file)
3912   // vtableStubs also counts instructions in pd_code_size_limit.
3913   // Also do not verify_oop as this is called by verify_oop.
3914   if (CompressedOops::shift() != 0) {
3915     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3916     if (CompressedOops::base() != NULL) {
3917       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3918     } else {
3919       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3920     }
3921   } else {
3922     assert (CompressedOops::base() == NULL, "sanity");
3923   }
3924 }
3925 
3926 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3927   assert (UseCompressedOops, "should only be used for compressed headers");
3928   assert (Universe::heap() != NULL, "java heap should be initialized");
3929   // Cannot assert, unverified entry point counts instructions (see .ad file)
3930   // vtableStubs also counts instructions in pd_code_size_limit.
3931   // Also do not verify_oop as this is called by verify_oop.
3932   if (CompressedOops::shift() != 0) {
3933     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3934     if (CompressedOops::base() != NULL) {
3935       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3936     } else {
3937       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3938     }
3939   } else {
3940     assert (CompressedOops::base() == NULL, "sanity");
3941     if (dst != src) {
3942       mov(dst, src);
3943     }
3944   }
3945 }
3946 
3947 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3948   if (CompressedKlassPointers::base() == NULL) {
3949     if (CompressedKlassPointers::shift() != 0) {
3950       assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
3951       lsr(dst, src, LogKlassAlignmentInBytes);
3952     } else {
3953       if (dst != src) mov(dst, src);
3954     }
3955     return;
3956   }
3957 
3958   if (use_XOR_for_compressed_class_base) {
3959     if (CompressedKlassPointers::shift() != 0) {
3960       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
3961       lsr(dst, dst, LogKlassAlignmentInBytes);
3962     } else {
3963       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
3964     }
3965     return;
3966   }
3967 
3968   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
3969       && CompressedKlassPointers::shift() == 0) {
3970     movw(dst, src);
3971     return;
3972   }
3973 
3974 #ifdef ASSERT
3975   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3976 #endif
3977 
3978   Register rbase = dst;
3979   if (dst == src) rbase = rheapbase;
3980   mov(rbase, (uint64_t)CompressedKlassPointers::base());
3981   sub(dst, src, rbase);
3982   if (CompressedKlassPointers::shift() != 0) {
3983     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
3984     lsr(dst, dst, LogKlassAlignmentInBytes);
3985   }
3986   if (dst == src) reinit_heapbase();
3987 }
3988 
3989 void MacroAssembler::encode_klass_not_null(Register r) {
3990   encode_klass_not_null(r, r);
3991 }
3992 
3993 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3994   Register rbase = dst;
3995   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3996 
3997   if (CompressedKlassPointers::base() == NULL) {
3998     if (CompressedKlassPointers::shift() != 0) {
3999       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4000       lsl(dst, src, LogKlassAlignmentInBytes);
4001     } else {
4002       if (dst != src) mov(dst, src);
4003     }
4004     return;
4005   }
4006 
4007   if (use_XOR_for_compressed_class_base) {
4008     if (CompressedKlassPointers::shift() != 0) {
4009       lsl(dst, src, LogKlassAlignmentInBytes);
4010       eor(dst, dst, (uint64_t)CompressedKlassPointers::base());
4011     } else {
4012       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
4013     }
4014     return;
4015   }
4016 
4017   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
4018       && CompressedKlassPointers::shift() == 0) {
4019     if (dst != src)
4020       movw(dst, src);
4021     movk(dst, (uint64_t)CompressedKlassPointers::base() >> 32, 32);
4022     return;
4023   }
4024 
4025   // Cannot assert, unverified entry point counts instructions (see .ad file)
4026   // vtableStubs also counts instructions in pd_code_size_limit.
4027   // Also do not verify_oop as this is called by verify_oop.
4028   if (dst == src) rbase = rheapbase;
4029   mov(rbase, (uint64_t)CompressedKlassPointers::base());
4030   if (CompressedKlassPointers::shift() != 0) {
4031     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
4032     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
4033   } else {
4034     add(dst, rbase, src);
4035   }
4036   if (dst == src) reinit_heapbase();
4037 }
4038 
4039 void  MacroAssembler::decode_klass_not_null(Register r) {
4040   decode_klass_not_null(r, r);
4041 }
4042 
4043 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4044 #ifdef ASSERT
4045   {
4046     ThreadInVMfromUnknown tiv;
4047     assert (UseCompressedOops, "should only be used for compressed oops");
4048     assert (Universe::heap() != NULL, "java heap should be initialized");
4049     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4050     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4051   }
4052 #endif
4053   int oop_index = oop_recorder()->find_index(obj);
4054   InstructionMark im(this);
4055   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4056   code_section()->relocate(inst_mark(), rspec);
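  // 0xDEADBEEF is only a placeholder; the oop relocation recorded above
  // allows the real narrow oop bits to be patched in later.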
4057   movz(dst, 0xDEAD, 16);
4058   movk(dst, 0xBEEF);
4059 }
4060 
4061 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4062   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4063   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4064   int index = oop_recorder()->find_index(k);
4065   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
4066 
4067   InstructionMark im(this);
4068   RelocationHolder rspec = metadata_Relocation::spec(index);
4069   code_section()->relocate(inst_mark(), rspec);
4070   narrowKlass nk = CompressedKlassPointers::encode(k);
4071   movz(dst, (nk >> 16), 16);
4072   movk(dst, nk & 0xffff);
4073 }
4074 
4075 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4076                                     Register dst, Address src,
4077                                     Register tmp1, Register thread_tmp) {
4078   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4079   decorators = AccessInternal::decorator_fixup(decorators);
4080   bool as_raw = (decorators & AS_RAW) != 0;
4081   if (as_raw) {
4082     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4083   } else {
4084     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4085   }
4086 }
4087 
4088 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4089                                      Address dst, Register src,
4090                                      Register tmp1, Register thread_tmp, Register tmp3) {
4091   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4092   decorators = AccessInternal::decorator_fixup(decorators);
4093   bool as_raw = (decorators & AS_RAW) != 0;
4094   if (as_raw) {
4095     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp, tmp3);
4096   } else {
4097     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp, tmp3);
4098   }
4099 }
4100 
4101 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
4102   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
4103   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
4104     decorators |= ACCESS_READ | ACCESS_WRITE;
4105   }
4106   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4107   return bs->resolve(this, decorators, obj);
4108 }
4109 
4110 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4111                                    Register thread_tmp, DecoratorSet decorators) {
4112   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4113 }
4114 
4115 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4116                                             Register thread_tmp, DecoratorSet decorators) {
4117   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4118 }
4119 
4120 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4121                                     Register thread_tmp, Register tmp3, DecoratorSet decorators) {
4122   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp, tmp3);
4123 }
4124 
4125 // Used for storing NULLs.
4126 void MacroAssembler::store_heap_oop_null(Address dst) {
4127   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
4128 }
4129 
4130 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4131   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4132   int index = oop_recorder()->allocate_metadata_index(obj);
4133   RelocationHolder rspec = metadata_Relocation::spec(index);
4134   return Address((address)obj, rspec);
4135 }
4136 
4137 // Move an oop into a register.  immediate is true if we want
// immediate instructions, i.e. we are not going to patch this
4139 // instruction while the code is being executed by another thread.  In
4140 // that case we can use move immediates rather than the constant pool.
4141 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4142   int oop_index;
4143   if (obj == NULL) {
4144     oop_index = oop_recorder()->allocate_oop_index(obj);
4145   } else {
4146 #ifdef ASSERT
4147     {
4148       ThreadInVMfromUnknown tiv;
4149       assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4150     }
4151 #endif
4152     oop_index = oop_recorder()->find_index(obj);
4153   }
4154   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4155   if (! immediate) {
4156     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4157     ldr_constant(dst, Address(dummy, rspec));
4158   } else
4159     mov(dst, Address((address)obj, rspec));
4160 }
4161 
4162 // Move a metadata address into a register.
4163 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4164   int oop_index;
4165   if (obj == NULL) {
4166     oop_index = oop_recorder()->allocate_metadata_index(obj);
4167   } else {
4168     oop_index = oop_recorder()->find_index(obj);
4169   }
4170   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4171   mov(dst, Address((address)obj, rspec));
4172 }
4173 
4174 Address MacroAssembler::constant_oop_address(jobject obj) {
4175 #ifdef ASSERT
4176   {
4177     ThreadInVMfromUnknown tiv;
4178     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4179     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
4180   }
4181 #endif
4182   int oop_index = oop_recorder()->find_index(obj);
4183   return Address((address)obj, oop_Relocation::spec(oop_index));
4184 }
4185 
4186 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4187 void MacroAssembler::tlab_allocate(Register obj,
4188                                    Register var_size_in_bytes,
4189                                    int con_size_in_bytes,
4190                                    Register t1,
4191                                    Register t2,
4192                                    Label& slow_case) {
4193   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4194   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4195 }
4196 
4197 // Defines obj, preserves var_size_in_bytes
4198 void MacroAssembler::eden_allocate(Register obj,
4199                                    Register var_size_in_bytes,
4200                                    int con_size_in_bytes,
4201                                    Register t1,
4202                                    Label& slow_case) {
4203   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4204   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4205 }
4206 
4207 // Zero words; len is in bytes
4208 // Destroys all registers except addr
4209 // len must be a nonzero multiple of wordSize
4210 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4211   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4212 
4213 #ifdef ASSERT
4214   { Label L;
4215     tst(len, BytesPerWord - 1);
4216     br(Assembler::EQ, L);
4217     stop("len is not a multiple of BytesPerWord");
4218     bind(L);
4219   }
4220 #endif
4221 
4222 #ifndef PRODUCT
4223   block_comment("zero memory");
4224 #endif
4225 
4226   Label loop;
4227   Label entry;
4228 
4229 //  Algorithm:
4230 //
4231 //    scratch1 = cnt & 7;
4232 //    cnt -= scratch1;
4233 //    p += scratch1;
4234 //    switch (scratch1) {
4235 //      do {
4236 //        cnt -= 8;
4237 //          p[-8] = 0;
4238 //        case 7:
4239 //          p[-7] = 0;
4240 //        case 6:
4241 //          p[-6] = 0;
4242 //          // ...
4243 //        case 1:
4244 //          p[-1] = 0;
4245 //        case 0:
4246 //          p += 8;
4247 //      } while (cnt);
4248 //    }
4249 
4250   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4251 
4252   lsr(len, len, LogBytesPerWord);
  andr(rscratch1, len, unroll - 1);  // rscratch1 = cnt % unroll
  sub(len, len, rscratch1);          // cnt -= cnt % unroll
4255   // t1 always points to the end of the region we're about to zero
4256   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
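  // Duff's-device-style entry: branch into the tail of the unrolled
  // loop so that the first (cnt % unroll) words are zeroed by the
  // trailing str(zr) instructions.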
4257   adr(rscratch2, entry);
4258   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4259   br(rscratch2);
4260   bind(loop);
4261   sub(len, len, unroll);
4262   for (int i = -unroll; i < 0; i++)
4263     Assembler::str(zr, Address(t1, i * wordSize));
4264   bind(entry);
4265   add(t1, t1, unroll * wordSize);
4266   cbnz(len, loop);
4267 }
4268 
4269 void MacroAssembler::verify_tlab() {
4270 #ifdef ASSERT
4271   if (UseTLAB && VerifyOops) {
4272     Label next, ok;
4273 
4274     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4275 
4276     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4277     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4278     cmp(rscratch2, rscratch1);
4279     br(Assembler::HS, next);
4280     STOP("assert(top >= start)");
4281     should_not_reach_here();
4282 
4283     bind(next);
4284     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4285     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4286     cmp(rscratch2, rscratch1);
4287     br(Assembler::HS, ok);
4288     STOP("assert(top <= end)");
4289     should_not_reach_here();
4290 
4291     bind(ok);
4292     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4293   }
4294 #endif
4295 }
4296 
4297 // Writes to stack successive pages until offset reached to check for
4298 // stack overflow + shadow pages.  This clobbers tmp.
4299 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4300   assert_different_registers(tmp, size, rscratch1);
4301   mov(tmp, sp);
4302   // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because a large size can bang beyond the yellow
  // and red zones.
4305   Label loop;
4306   mov(rscratch1, os::vm_page_size());
4307   bind(loop);
4308   lea(tmp, Address(tmp, -os::vm_page_size()));
4309   subsw(size, size, rscratch1);
4310   str(size, Address(tmp));
4311   br(Assembler::GT, loop);
4312 
4313   // Bang down shadow pages too.
4314   // At this point, (tmp-0) is the last address touched, so don't
4315   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4316   // was post-decremented.)  Skip this address by starting at i=1, and
4317   // touch a few more pages below.  N.B.  It is important to touch all
4318   // the way down to and including i=StackShadowPages.
4319   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
    // This could be any sized move, but since it may serve as a debugging
    // crumb, the bigger the better.
4322     lea(tmp, Address(tmp, -os::vm_page_size()));
4323     str(size, Address(tmp));
4324   }
4325 }
4326 
4327 
4328 // Move the address of the polling page into dest.
4329 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4330   if (SafepointMechanism::uses_thread_local_poll()) {
4331     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4332   } else {
4333     unsigned long off;
4334     adrp(dest, Address(page, rtype), off);
4335     assert(off == 0, "polling page must be page aligned");
4336   }
4337 }
4338 
4339 // Move the address of the polling page into r, then read the polling
4340 // page.
4341 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4342   get_polling_page(r, page, rtype);
4343   return read_polling_page(r, rtype);
4344 }
4345 
4346 // Read the polling page.  The address of the polling page must
4347 // already be in r.
4348 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4349   InstructionMark im(this);
4350   code_section()->relocate(inst_mark(), rtype);
4351   ldrw(zr, Address(r, 0));
4352   return inst_mark();
4353 }
4354 
4355 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4356   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4357   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4358   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4359   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4360   long offset_low = dest_page - low_page;
4361   long offset_high = dest_page - high_page;
4362 
4363   assert(is_valid_AArch64_address(dest.target()), "bad address");
4364   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4365 
4366   InstructionMark im(this);
4367   code_section()->relocate(inst_mark(), dest.rspec());
  // 8143067: Ensure that the adrp can reach the dest from anywhere within
  // the code cache, so that if it is relocated we know it will still reach.
4370   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4371     _adrp(reg1, dest.target());
4372   } else {
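    // We are out of ADRP range. Materialize a 48-bit address instead: keep
    // the target's low 32 bits and borrow bits 32-47 from the current pc so
    // that the ADRP offset stays within its +/-4GB range, then overwrite
    // bits 32-47 with the real ones using MOVK.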
4373     unsigned long target = (unsigned long)dest.target();
4374     unsigned long adrp_target
4375       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4376 
4377     _adrp(reg1, (address)adrp_target);
4378     movk(reg1, target >> 32, 32);
4379   }
4380   byte_offset = (unsigned long)dest.target() & 0xfff;
4381 }
4382 
4383 void MacroAssembler::load_byte_map_base(Register reg) {
4384   CardTable::CardValue* byte_map_base =
4385     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4386 
4387   if (is_valid_AArch64_address((address)byte_map_base)) {
4388     // Strictly speaking the byte_map_base isn't an address at all,
4389     // and it might even be negative.
4390     unsigned long offset;
4391     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4392     // We expect offset to be zero with most collectors.
4393     if (offset != 0) {
4394       add(reg, reg, offset);
4395     }
4396   } else {
4397     mov(reg, (uint64_t)byte_map_base);
4398   }
4399 }
4400 
4401 void MacroAssembler::build_frame(int framesize) {
4402   assert(framesize > 0, "framesize must be > 0");
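  // For framesize < (1 << 9) + 2 * wordSize the frame top is reachable by
  // stp's scaled signed 7-bit offset (at most 504 bytes for 64-bit pairs);
  // frames up to (1 << 12) + 2 * wordSize still fit sub's 12-bit unsigned
  // immediate; anything larger needs a scratch register.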
4403   if (framesize < ((1 << 9) + 2 * wordSize)) {
4404     sub(sp, sp, framesize);
4405     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4406     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4407   } else {
4408     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4409     if (PreserveFramePointer) mov(rfp, sp);
4410     if (framesize < ((1 << 12) + 2 * wordSize))
4411       sub(sp, sp, framesize - 2 * wordSize);
4412     else {
4413       mov(rscratch1, framesize - 2 * wordSize);
4414       sub(sp, sp, rscratch1);
4415     }
4416   }
4417 }
4418 
4419 void MacroAssembler::remove_frame(int framesize) {
4420   assert(framesize > 0, "framesize must be > 0");
4421   if (framesize < ((1 << 9) + 2 * wordSize)) {
4422     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4423     add(sp, sp, framesize);
4424   } else {
4425     if (framesize < ((1 << 12) + 2 * wordSize))
4426       add(sp, sp, framesize - 2 * wordSize);
4427     else {
4428       mov(rscratch1, framesize - 2 * wordSize);
4429       add(sp, sp, rscratch1);
4430     }
4431     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4432   }
4433 }
4434 
4435 #ifdef COMPILER2
4436 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4437 
4438 // Search for str1 in str2 and return index or -1
4439 void MacroAssembler::string_indexof(Register str2, Register str1,
4440                                     Register cnt2, Register cnt1,
4441                                     Register tmp1, Register tmp2,
4442                                     Register tmp3, Register tmp4,
4443                                     Register tmp5, Register tmp6,
4444                                     int icnt1, Register result, int ae) {
4445   // NOTE: tmp5, tmp6 can be zr depending on specific method version
4446   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4447 
4448   Register ch1 = rscratch1;
4449   Register ch2 = rscratch2;
4450   Register cnt1tmp = tmp1;
4451   Register cnt2tmp = tmp2;
4452   Register cnt1_neg = cnt1;
4453   Register cnt2_neg = cnt2;
4454   Register result_tmp = tmp4;
4455 
4456   bool isL = ae == StrIntrinsicNode::LL;
4457 
4458   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4459   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4460   int str1_chr_shift = str1_isL ? 0:1;
4461   int str2_chr_shift = str2_isL ? 0:1;
4462   int str1_chr_size = str1_isL ? 1:2;
4463   int str2_chr_size = str2_isL ? 1:2;
4464   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4465                                       (chr_insn)&MacroAssembler::ldrh;
4466   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4467                                       (chr_insn)&MacroAssembler::ldrh;
4468   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4469   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4470 
4471   // Note, inline_string_indexOf() generates checks:
4472   // if (substr.count > string.count) return -1;
4473   // if (substr.count == 0) return 0;
4474 
  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the first occurrence of the pattern in the source
  // or return -1.
4477 
  // For a larger pattern and source we use a simplified Boyer-Moore
  // algorithm. With a small pattern and source we use a linear scan.
4480 
4481   if (icnt1 == -1) {
4482     sub(result_tmp, cnt2, cnt1);
4483     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4484     br(LT, LINEARSEARCH);
4485     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
4486     subs(zr, cnt1, 256);
4487     lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be at least 4x the pattern for BM
4489     br(GE, LINEARSTUB);
4490   }
4491 
// The Boyer-Moore algorithm is based on the description here:
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few Java-specific optimizations.
4510 //
4511 // #define ASIZE 256
4512 //
4513 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4514 //       int i, j;
4515 //       unsigned c;
4516 //       unsigned char bc[ASIZE];
4517 //
4518 //       /* Preprocessing */
4519 //       for (i = 0; i < ASIZE; ++i)
4520 //          bc[i] = m;
4521 //       for (i = 0; i < m - 1; ) {
4522 //          c = x[i];
4523 //          ++i;
//          // c < 256 for a Latin1 string, so no need for a branch
4525 //          #ifdef PATTERN_STRING_IS_LATIN1
4526 //          bc[c] = m - i;
4527 //          #else
4528 //          if (c < ASIZE) bc[c] = m - i;
4529 //          #endif
4530 //       }
4531 //
4532 //       /* Searching */
4533 //       j = 0;
4534 //       while (j <= n - m) {
//          c = y[j+m-1];
//          if (x[m-1] == c) {
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//            if (i < 0) return j;
//          }
//          // c < 256 for a Latin1 string, so no need for a branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) is always true, so the branch is removed
//          j += bc[y[j+m-1]];
//          #endif
//          #ifdef PATTERN_STRING_IS_UTF
//          // UU case: need the if (c < ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1;
4550 //          #endif
4551 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
4552 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
4553 //          if (c < ASIZE)
4554 //            j += bc[y[j+m-1]];
4555 //          else
//            j += m;
4557 //          #endif
//       }
//       return -1;
//    }
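//
// As a worked example (hypothetical): for the pattern "abc" (m = 3) the
// preprocessing above gives bc['a'] = 2, bc['b'] = 1, and bc[c] = 3 for
// every other byte c, so a mismatch on a byte that does not occur in the
// pattern shifts the window by the whole pattern length.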
4560 
4561   if (icnt1 == -1) {
4562     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4563         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4564     Register cnt1end = tmp2;
4565     Register str2end = cnt2;
4566     Register skipch = tmp2;
4567 
    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a
    // register for the UL case. We'll re-read the last character in the inner
    // pre-loop code so that the outer pre-loop needs only a single load.
4572     const int firstStep = isL ? 7 : 3;
4573 
4574     const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
4576     sub(sp, sp, ASIZE);
4577     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4578     mov(ch1, sp);
4579     BIND(BM_INIT_LOOP);
4580       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4581       subs(tmp5, tmp5, 1);
4582       br(GT, BM_INIT_LOOP);
4583 
4584       sub(cnt1tmp, cnt1, 1);
4585       mov(tmp5, str2);
4586       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4587       sub(ch2, cnt1, 1);
4588       mov(tmp3, str1);
4589     BIND(BCLOOP);
4590       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4591       if (!str1_isL) {
4592         subs(zr, ch1, ASIZE);
4593         br(HS, BCSKIP);
4594       }
4595       strb(ch2, Address(sp, ch1));
4596     BIND(BCSKIP);
4597       subs(ch2, ch2, 1);
4598       br(GT, BCLOOP);
4599 
4600       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4601       if (str1_isL == str2_isL) {
4602         // load last 8 bytes (8LL/4UU symbols)
4603         ldr(tmp6, Address(tmp6, -wordSize));
4604       } else {
4605         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
        // convert Latin1 to UTF. We'll have to wait until the load completes,
        // but it's still faster than per-character loads+checks
4608         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4609         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4610         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4611         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4612         orr(ch2, ch1, ch2, LSL, 16);
4613         orr(tmp6, tmp6, tmp3, LSL, 48);
4614         orr(tmp6, tmp6, ch2, LSL, 16);
4615       }
4616     BIND(BMLOOPSTR2);
4617       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4618       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4619       if (str1_isL == str2_isL) {
        // re-init tmp3. It's free because it executes in parallel with the
        // load above. The alternative is to initialize it before the loop, but
        // that would hurt performance on in-order systems with 2 or more
        // ld/st pipelines
4623         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4624       }
4625       if (!isL) { // UU/UL case
4626         lsl(ch2, cnt1tmp, 1); // offset in bytes
4627       }
4628       cmp(tmp3, skipch);
4629       br(NE, BMSKIP);
4630       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4631       mov(ch1, tmp6);
4632       if (isL) {
4633         b(BMLOOPSTR1_AFTER_LOAD);
4634       } else {
4635         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4636         b(BMLOOPSTR1_CMP);
4637       }
4638     BIND(BMLOOPSTR1);
4639       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4640       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4641     BIND(BMLOOPSTR1_AFTER_LOAD);
4642       subs(cnt1tmp, cnt1tmp, 1);
4643       br(LT, BMLOOPSTR1_LASTCMP);
4644     BIND(BMLOOPSTR1_CMP);
4645       cmp(ch1, ch2);
4646       br(EQ, BMLOOPSTR1);
4647     BIND(BMSKIP);
4648       if (!isL) {
        // if we've met a UTF symbol while searching for a Latin1 pattern, then
        // we can skip cnt1 symbols
4651         if (str1_isL != str2_isL) {
4652           mov(result_tmp, cnt1);
4653         } else {
4654           mov(result_tmp, 1);
4655         }
4656         subs(zr, skipch, ASIZE);
4657         br(HS, BMADV);
4658       }
4659       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4660     BIND(BMADV);
4661       sub(cnt1tmp, cnt1, 1);
4662       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4663       cmp(str2, str2end);
4664       br(LE, BMLOOPSTR2);
4665       add(sp, sp, ASIZE);
4666       b(NOMATCH);
4667     BIND(BMLOOPSTR1_LASTCMP);
4668       cmp(ch1, ch2);
4669       br(NE, BMSKIP);
4670     BIND(BMMATCH);
4671       sub(result, str2, tmp5);
4672       if (!str2_isL) lsr(result, result, 1);
4673       add(sp, sp, ASIZE);
4674       b(DONE);
4675 
4676     BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
4678     br(LT, LINEAR_MEDIUM);
4679     mov(result, zr);
4680     RuntimeAddress stub = NULL;
4681     if (isL) {
4682       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4683       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4684     } else if (str1_isL) {
4685       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4687     } else {
4688       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4689       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4690     }
4691     trampoline_call(stub);
4692     b(DONE);
4693   }
4694 
4695   BIND(LINEARSEARCH);
4696   {
4697     Label DO1, DO2, DO3;
4698 
4699     Register str2tmp = tmp2;
4700     Register first = tmp3;
4701 
4702     if (icnt1 == -1)
4703     {
4704         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4705 
4706         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
4707         br(LT, DOSHORT);
4708       BIND(LINEAR_MEDIUM);
4709         (this->*str1_load_1chr)(first, Address(str1));
4710         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4711         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4712         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4713         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4714 
4715       BIND(FIRST_LOOP);
4716         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4717         cmp(first, ch2);
4718         br(EQ, STR1_LOOP);
4719       BIND(STR2_NEXT);
4720         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4721         br(LE, FIRST_LOOP);
4722         b(NOMATCH);
4723 
4724       BIND(STR1_LOOP);
4725         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4726         add(cnt2tmp, cnt2_neg, str2_chr_size);
4727         br(GE, MATCH);
4728 
4729       BIND(STR1_NEXT);
4730         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4731         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4732         cmp(ch1, ch2);
4733         br(NE, STR2_NEXT);
4734         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4735         add(cnt2tmp, cnt2tmp, str2_chr_size);
4736         br(LT, STR1_NEXT);
4737         b(MATCH);
4738 
4739       BIND(DOSHORT);
4740       if (str1_isL == str2_isL) {
4741         cmp(cnt1, (u1)2);
4742         br(LT, DO1);
4743         br(GT, DO3);
4744       }
4745     }
4746 
4747     if (icnt1 == 4) {
4748       Label CH1_LOOP;
4749 
4750         (this->*load_4chr)(ch1, str1);
4751         sub(result_tmp, cnt2, 4);
4752         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4753         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4754 
4755       BIND(CH1_LOOP);
4756         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4757         cmp(ch1, ch2);
4758         br(EQ, MATCH);
4759         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4760         br(LE, CH1_LOOP);
4761         b(NOMATCH);
4762       }
4763 
4764     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4765       Label CH1_LOOP;
4766 
4767       BIND(DO2);
4768         (this->*load_2chr)(ch1, str1);
4769         if (icnt1 == 2) {
4770           sub(result_tmp, cnt2, 2);
4771         }
4772         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4773         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4774       BIND(CH1_LOOP);
4775         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4776         cmp(ch1, ch2);
4777         br(EQ, MATCH);
4778         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4779         br(LE, CH1_LOOP);
4780         b(NOMATCH);
4781     }
4782 
4783     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4784       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4785 
4786       BIND(DO3);
4787         (this->*load_2chr)(first, str1);
4788         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4789         if (icnt1 == 3) {
4790           sub(result_tmp, cnt2, 3);
4791         }
4792         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4793         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4794       BIND(FIRST_LOOP);
4795         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4796         cmpw(first, ch2);
4797         br(EQ, STR1_LOOP);
4798       BIND(STR2_NEXT);
4799         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4800         br(LE, FIRST_LOOP);
4801         b(NOMATCH);
4802 
4803       BIND(STR1_LOOP);
4804         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4805         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4806         cmp(ch1, ch2);
4807         br(NE, STR2_NEXT);
4808         b(MATCH);
4809     }
4810 
4811     if (icnt1 == -1 || icnt1 == 1) {
4812       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4813 
4814       BIND(DO1);
4815         (this->*str1_load_1chr)(ch1, str1);
4816         cmp(cnt2, (u1)8);
4817         br(LT, DO1_SHORT);
4818 
4819         sub(result_tmp, cnt2, 8/str2_chr_size);
4820         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4821         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
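        // SWAR zero-element test: after xor'ing with the replicated search
        // char a matching element becomes zero, and
        // (x - 0x01..01) & ~(x | 0x7f..7f) sets the top bit of the lowest
        // zero element. Borrows can produce false positives in higher
        // elements, which is harmless because HAS_ZERO takes the lowest match.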
4822         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4823 
4824         if (str2_isL) {
4825           orr(ch1, ch1, ch1, LSL, 8);
4826         }
4827         orr(ch1, ch1, ch1, LSL, 16);
4828         orr(ch1, ch1, ch1, LSL, 32);
4829       BIND(CH1_LOOP);
4830         ldr(ch2, Address(str2, cnt2_neg));
4831         eor(ch2, ch1, ch2);
4832         sub(tmp1, ch2, tmp3);
4833         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4834         bics(tmp1, tmp1, tmp2);
4835         br(NE, HAS_ZERO);
4836         adds(cnt2_neg, cnt2_neg, 8);
4837         br(LT, CH1_LOOP);
4838 
4839         cmp(cnt2_neg, (u1)8);
4840         mov(cnt2_neg, 0);
4841         br(LT, CH1_LOOP);
4842         b(NOMATCH);
4843 
4844       BIND(HAS_ZERO);
4845         rev(tmp1, tmp1);
4846         clz(tmp1, tmp1);
4847         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4848         b(MATCH);
4849 
4850       BIND(DO1_SHORT);
4851         mov(result_tmp, cnt2);
4852         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4853         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4854       BIND(DO1_LOOP);
4855         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4856         cmpw(ch1, ch2);
4857         br(EQ, MATCH);
4858         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4859         br(LT, DO1_LOOP);
4860     }
4861   }
4862   BIND(NOMATCH);
4863     mov(result, -1);
4864     b(DONE);
4865   BIND(MATCH);
4866     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4867   BIND(DONE);
4868 }
4869 
4870 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4871 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4872 
4873 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4874                                          Register ch, Register result,
4875                                          Register tmp1, Register tmp2, Register tmp3)
4876 {
4877   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4878   Register cnt1_neg = cnt1;
4879   Register ch1 = rscratch1;
4880   Register result_tmp = rscratch2;
4881 
4882   cmp(cnt1, (u1)4);
4883   br(LT, DO1_SHORT);
4884 
4885   orr(ch, ch, ch, LSL, 16);
4886   orr(ch, ch, ch, LSL, 32);
4887 
4888   sub(cnt1, cnt1, 4);
4889   mov(result_tmp, cnt1);
4890   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4891   sub(cnt1_neg, zr, cnt1, LSL, 1);
4892 
4893   mov(tmp3, 0x0001000100010001);
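  // Same SWAR zero-halfword test as in string_indexof: a matching char
  // becomes a zero halfword after the eor, and the sub/orr/bics sequence
  // flags the lowest such halfword.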
4894 
4895   BIND(CH1_LOOP);
4896     ldr(ch1, Address(str1, cnt1_neg));
4897     eor(ch1, ch, ch1);
4898     sub(tmp1, ch1, tmp3);
4899     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4900     bics(tmp1, tmp1, tmp2);
4901     br(NE, HAS_ZERO);
4902     adds(cnt1_neg, cnt1_neg, 8);
4903     br(LT, CH1_LOOP);
4904 
4905     cmp(cnt1_neg, (u1)8);
4906     mov(cnt1_neg, 0);
4907     br(LT, CH1_LOOP);
4908     b(NOMATCH);
4909 
4910   BIND(HAS_ZERO);
4911     rev(tmp1, tmp1);
4912     clz(tmp1, tmp1);
4913     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4914     b(MATCH);
4915 
4916   BIND(DO1_SHORT);
4917     mov(result_tmp, cnt1);
4918     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4919     sub(cnt1_neg, zr, cnt1, LSL, 1);
4920   BIND(DO1_LOOP);
4921     ldrh(ch1, Address(str1, cnt1_neg));
4922     cmpw(ch, ch1);
4923     br(EQ, MATCH);
4924     adds(cnt1_neg, cnt1_neg, 2);
4925     br(LT, DO1_LOOP);
4926   BIND(NOMATCH);
4927     mov(result, -1);
4928     b(DONE);
4929   BIND(MATCH);
4930     add(result, result_tmp, cnt1_neg, ASR, 1);
4931   BIND(DONE);
4932 }
4933 
4934 // Compare strings.
4935 void MacroAssembler::string_compare(Register str1, Register str2,
4936     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4937     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4938   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4939       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4940       SHORT_LOOP_START, TAIL_CHECK;
4941 
4942   const u1 STUB_THRESHOLD = 64 + 8;
4943   bool isLL = ae == StrIntrinsicNode::LL;
4944   bool isLU = ae == StrIntrinsicNode::LU;
4945   bool isUL = ae == StrIntrinsicNode::UL;
4946 
4947   bool str1_isL = isLL || isLU;
4948   bool str2_isL = isLL || isUL;
4949 
4950   int str1_chr_shift = str1_isL ? 0 : 1;
4951   int str2_chr_shift = str2_isL ? 0 : 1;
4952   int str1_chr_size = str1_isL ? 1 : 2;
4953   int str2_chr_size = str2_isL ? 1 : 2;
4954   int minCharsInWord = isLL ? wordSize : wordSize/2;
4955 
4956   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4957   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4958                                       (chr_insn)&MacroAssembler::ldrh;
4959   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4960                                       (chr_insn)&MacroAssembler::ldrh;
4961   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4962                             (uxt_insn)&MacroAssembler::uxthw;
4963 
4964   BLOCK_COMMENT("string_compare {");
4965 
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
4968   if (!str1_isL) asrw(cnt1, cnt1, 1);
4969   if (!str2_isL) asrw(cnt2, cnt2, 1);
4970 
4971   // Compute the minimum of the string lengths and save the difference.
4972   subsw(result, cnt1, cnt2);
4973   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4974 
4975   // A very short string
4976   cmpw(cnt2, minCharsInWord);
4977   br(Assembler::LE, SHORT_STRING);
4978 
4979   // Compare longwords
4980   // load first parts of strings and finish initialization while loading
4981   {
4982     if (str1_isL == str2_isL) { // LL or UU
4983       ldr(tmp1, Address(str1));
4984       cmp(str1, str2);
4985       br(Assembler::EQ, DONE);
4986       ldr(tmp2, Address(str2));
4987       cmp(cnt2, STUB_THRESHOLD);
4988       br(GE, STUB);
4989       subsw(cnt2, cnt2, minCharsInWord);
4990       br(EQ, TAIL_CHECK);
4991       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4992       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4993       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4994     } else if (isLU) {
4995       ldrs(vtmp, Address(str1));
4996       cmp(str1, str2);
4997       br(Assembler::EQ, DONE);
4998       ldr(tmp2, Address(str2));
4999       cmp(cnt2, STUB_THRESHOLD);
5000       br(GE, STUB);
5001       subw(cnt2, cnt2, 4);
5002       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
5003       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
5004       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
5005       zip1(vtmp, T8B, vtmp, vtmpZ);
5006       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
5007       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
5008       add(cnt1, cnt1, 4);
5009       fmovd(tmp1, vtmp);
5010     } else { // UL case
5011       ldr(tmp1, Address(str1));
5012       cmp(str1, str2);
5013       br(Assembler::EQ, DONE);
5014       ldrs(vtmp, Address(str2));
5015       cmp(cnt2, STUB_THRESHOLD);
5016       br(GE, STUB);
5017       subw(cnt2, cnt2, 4);
5018       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
5019       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
5020       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
5021       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
5022       zip1(vtmp, T8B, vtmp, vtmpZ);
5023       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
5024       add(cnt1, cnt1, 8);
5025       fmovd(tmp2, vtmp);
5026     }
5027     adds(cnt2, cnt2, isUL ? 4 : 8);
5028     br(GE, TAIL);
5029     eor(rscratch2, tmp1, tmp2);
5030     cbnz(rscratch2, DIFFERENCE);
5031     // main loop
5032     bind(NEXT_WORD);
5033     if (str1_isL == str2_isL) {
5034       ldr(tmp1, Address(str1, cnt2));
5035       ldr(tmp2, Address(str2, cnt2));
5036       adds(cnt2, cnt2, 8);
5037     } else if (isLU) {
5038       ldrs(vtmp, Address(str1, cnt1));
5039       ldr(tmp2, Address(str2, cnt2));
5040       add(cnt1, cnt1, 4);
5041       zip1(vtmp, T8B, vtmp, vtmpZ);
5042       fmovd(tmp1, vtmp);
5043       adds(cnt2, cnt2, 8);
5044     } else { // UL
5045       ldrs(vtmp, Address(str2, cnt2));
5046       ldr(tmp1, Address(str1, cnt1));
5047       zip1(vtmp, T8B, vtmp, vtmpZ);
5048       add(cnt1, cnt1, 8);
5049       fmovd(tmp2, vtmp);
5050       adds(cnt2, cnt2, 4);
5051     }
5052     br(GE, TAIL);
5053 
5054     eor(rscratch2, tmp1, tmp2);
5055     cbz(rscratch2, NEXT_WORD);
5056     b(DIFFERENCE);
5057     bind(TAIL);
5058     eor(rscratch2, tmp1, tmp2);
5059     cbnz(rscratch2, DIFFERENCE);
5060     // Last longword.  In the case where length == 4 we compare the
5061     // same longword twice, but that's still faster than another
5062     // conditional branch.
5063     if (str1_isL == str2_isL) {
5064       ldr(tmp1, Address(str1));
5065       ldr(tmp2, Address(str2));
5066     } else if (isLU) {
5067       ldrs(vtmp, Address(str1));
5068       ldr(tmp2, Address(str2));
5069       zip1(vtmp, T8B, vtmp, vtmpZ);
5070       fmovd(tmp1, vtmp);
5071     } else { // UL
5072       ldrs(vtmp, Address(str2));
5073       ldr(tmp1, Address(str1));
5074       zip1(vtmp, T8B, vtmp, vtmpZ);
5075       fmovd(tmp2, vtmp);
5076     }
5077     bind(TAIL_CHECK);
5078     eor(rscratch2, tmp1, tmp2);
5079     cbz(rscratch2, DONE);
5080 
5081     // Find the first different characters in the longwords and
5082     // compute their difference.
5083     bind(DIFFERENCE);
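    // rev+clz finds the bit index of the lowest-addressed differing byte
    // (the strings' first difference, since the loads are little-endian);
    // it is then rounded down to a character boundary and used to shift the
    // differing characters into the low bits for the subtraction.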
5084     rev(rscratch2, rscratch2);
5085     clz(rscratch2, rscratch2);
5086     andr(rscratch2, rscratch2, isLL ? -8 : -16);
5087     lsrv(tmp1, tmp1, rscratch2);
5088     (this->*ext_chr)(tmp1, tmp1);
5089     lsrv(tmp2, tmp2, rscratch2);
5090     (this->*ext_chr)(tmp2, tmp2);
5091     subw(result, tmp1, tmp2);
5092     b(DONE);
5093   }
5094 
5095   bind(STUB);
5096     RuntimeAddress stub = NULL;
5097     switch(ae) {
5098       case StrIntrinsicNode::LL:
5099         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
5100         break;
5101       case StrIntrinsicNode::UU:
5102         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
5103         break;
5104       case StrIntrinsicNode::LU:
5105         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
5106         break;
5107       case StrIntrinsicNode::UL:
5108         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
5109         break;
5110       default:
5111         ShouldNotReachHere();
5112      }
5113     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
5114     trampoline_call(stub);
5115     b(DONE);
5116 
5117   bind(SHORT_STRING);
5118   // Is the minimum length zero?
5119   cbz(cnt2, DONE);
  // Arrange the code so that most branches resolve while loads are in flight:
  // load the next characters while comparing the previous ones.
5122   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5123   subs(cnt2, cnt2, 1);
5124   br(EQ, SHORT_LAST_INIT);
5125   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5126   b(SHORT_LOOP_START);
5127   bind(SHORT_LOOP);
5128   subs(cnt2, cnt2, 1);
5129   br(EQ, SHORT_LAST);
5130   bind(SHORT_LOOP_START);
5131   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
5132   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
5133   cmp(tmp1, cnt1);
5134   br(NE, SHORT_LOOP_TAIL);
5135   subs(cnt2, cnt2, 1);
5136   br(EQ, SHORT_LAST2);
5137   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5138   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5139   cmp(tmp2, rscratch1);
5140   br(EQ, SHORT_LOOP);
5141   sub(result, tmp2, rscratch1);
5142   b(DONE);
5143   bind(SHORT_LOOP_TAIL);
5144   sub(result, tmp1, cnt1);
5145   b(DONE);
5146   bind(SHORT_LAST2);
5147   cmp(tmp2, rscratch1);
5148   br(EQ, DONE);
5149   sub(result, tmp2, rscratch1);
5150 
5151   b(DONE);
5152   bind(SHORT_LAST_INIT);
5153   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5154   bind(SHORT_LAST);
5155   cmp(tmp1, cnt1);
5156   br(EQ, DONE);
5157   sub(result, tmp1, cnt1);
5158 
5159   bind(DONE);
5160 
5161   BLOCK_COMMENT("} string_compare");
5162 }
5163 #endif // COMPILER2
5164 
// This method checks whether the provided byte array contains a byte with its
// highest bit set.
5166 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
    // The simple and most common case, a small aligned array that is not at
    // the end of a memory page, is handled here. All other cases are in stubs.
5169     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
    const uint64_t UPPER_BIT_MASK = 0x8080808080808080;
5171     assert_different_registers(ary1, len, result);
5172 
5173     cmpw(len, 0);
5174     br(LE, SET_RESULT);
5175     cmpw(len, 4 * wordSize);
    br(GE, STUB_LONG); // if size >= 32 go to the long stub
5177 
5178     int shift = 64 - exact_log2(os::vm_page_size());
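    // Shifting ary1 left by (64 - log2(page_size)) leaves its page offset in
    // the top bits, so the adds below sets the carry flag (CS) exactly when
    // a 32-byte read starting at ary1 would reach or cross the page end.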
5179     lsl(rscratch1, ary1, shift);
5180     mov(rscratch2, (size_t)(4 * wordSize) << shift);
    adds(rscratch2, rscratch1, rscratch2);  // Would we reach the end of the page?
    br(CS, STUB); // if so, go to the stub
5183     subs(len, len, wordSize);
5184     br(LT, END);
5185 
5186   BIND(LOOP);
5187     ldr(rscratch1, Address(post(ary1, wordSize)));
5188     tst(rscratch1, UPPER_BIT_MASK);
5189     br(NE, SET_RESULT);
5190     subs(len, len, wordSize);
5191     br(GE, LOOP);
5192     cmpw(len, -wordSize);
5193     br(EQ, SET_RESULT);
5194 
5195   BIND(END);
5196     ldr(result, Address(ary1));
5197     sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5198     lslv(result, result, len);
5199     tst(result, UPPER_BIT_MASK);
5200     b(SET_RESULT);
5201 
5202   BIND(STUB);
5203     RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
5204     assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5205     trampoline_call(has_neg);
5206     b(DONE);
5207 
5208   BIND(STUB_LONG);
5209     RuntimeAddress has_neg_long =  RuntimeAddress(
5210             StubRoutines::aarch64::has_negatives_long());
    assert(has_neg_long.target() != NULL, "has_negatives_long stub has not been generated");
5212     trampoline_call(has_neg_long);
5213     b(DONE);
5214 
5215   BIND(SET_RESULT);
5216     cset(result, NE); // set true or false
5217 
5218   BIND(DONE);
5219 }
5220 
5221 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5222                                    Register tmp4, Register tmp5, Register result,
5223                                    Register cnt1, int elem_size) {
5224   Label DONE, SAME;
5225   Register tmp1 = rscratch1;
5226   Register tmp2 = rscratch2;
5227   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5228   int elem_per_word = wordSize/elem_size;
5229   int log_elem_size = exact_log2(elem_size);
5230   int length_offset = arrayOopDesc::length_offset_in_bytes();
5231   int base_offset
5232     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5233   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5234 
5235   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5236   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5237 
5238 #ifndef PRODUCT
5239   {
5240     const char kind = (elem_size == 2) ? 'U' : 'L';
5241     char comment[64];
5242     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5243     BLOCK_COMMENT(comment);
5244   }
5245 #endif
5246 
5247   // if (a1 == a2)
5248   //     return true;
5249   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5250   br(EQ, SAME);
5251 
5252   if (UseSimpleArrayEquals) {
5253     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5254     // if (a1 == null || a2 == null)
5255     //     return false;
    // (a1 & a2) == 0 means that either some pointer is null or we have hit
    // very-rare-or-even-probably-impossible pointer values, so we can save
    // one branch in most cases
5259     tst(a1, a2);
5260     mov(result, false);
5261     br(EQ, A_MIGHT_BE_NULL);
5262     // if (a1.length != a2.length)
5263     //      return false;
5264     bind(A_IS_NOT_NULL);
5265     ldrw(cnt1, Address(a1, length_offset));
5266     ldrw(cnt2, Address(a2, length_offset));
5267     eorw(tmp5, cnt1, cnt2);
5268     cbnzw(tmp5, DONE);
5269     lea(a1, Address(a1, base_offset));
5270     lea(a2, Address(a2, base_offset));
5271     // Check for short strings, i.e. smaller than wordSize.
5272     subs(cnt1, cnt1, elem_per_word);
5273     br(Assembler::LT, SHORT);
5274     // Main 8 byte comparison loop.
5275     bind(NEXT_WORD); {
5276       ldr(tmp1, Address(post(a1, wordSize)));
5277       ldr(tmp2, Address(post(a2, wordSize)));
5278       subs(cnt1, cnt1, elem_per_word);
5279       eor(tmp5, tmp1, tmp2);
5280       cbnz(tmp5, DONE);
5281     } br(GT, NEXT_WORD);
5282     // Last longword.  In the case where length == 4 we compare the
5283     // same longword twice, but that's still faster than another
5284     // conditional branch.
5285     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5286     // length == 4.
5287     if (log_elem_size > 0)
5288       lsl(cnt1, cnt1, log_elem_size);
5289     ldr(tmp3, Address(a1, cnt1));
5290     ldr(tmp4, Address(a2, cnt1));
5291     eor(tmp5, tmp3, tmp4);
5292     cbnz(tmp5, DONE);
5293     b(SAME);
5294     bind(A_MIGHT_BE_NULL);
5295     // in case both a1 and a2 are not-null, proceed with loads
5296     cbz(a1, DONE);
5297     cbz(a2, DONE);
5298     b(A_IS_NOT_NULL);
5299     bind(SHORT);
5300 
5301     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5302     {
5303       ldrw(tmp1, Address(post(a1, 4)));
5304       ldrw(tmp2, Address(post(a2, 4)));
5305       eorw(tmp5, tmp1, tmp2);
5306       cbnzw(tmp5, DONE);
5307     }
5308     bind(TAIL03);
5309     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5310     {
5311       ldrh(tmp3, Address(post(a1, 2)));
5312       ldrh(tmp4, Address(post(a2, 2)));
5313       eorw(tmp5, tmp3, tmp4);
5314       cbnzw(tmp5, DONE);
5315     }
5316     bind(TAIL01);
5317     if (elem_size == 1) { // Only needed when comparing byte arrays.
5318       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5319       {
5320         ldrb(tmp1, a1);
5321         ldrb(tmp2, a2);
5322         eorw(tmp5, tmp1, tmp2);
5323         cbnzw(tmp5, DONE);
5324       }
5325     }
5326   } else {
5327     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5328         CSET_EQ, LAST_CHECK;
5329     mov(result, false);
5330     cbz(a1, DONE);
5331     ldrw(cnt1, Address(a1, length_offset));
5332     cbz(a2, DONE);
5333     ldrw(cnt2, Address(a2, length_offset));
    // on most CPUs a2 is (surprisingly) still "locked" by the ldrw above, so
    // it's faster to perform another branch before comparing a1 and a2
5336     cmp(cnt1, (u1)elem_per_word);
5337     br(LE, SHORT); // short or same
5338     ldr(tmp3, Address(pre(a1, base_offset)));
5339     subs(zr, cnt1, stubBytesThreshold);
5340     br(GE, STUB);
5341     ldr(tmp4, Address(pre(a2, base_offset)));
5342     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5343     cmp(cnt2, cnt1);
5344     br(NE, DONE);
5345 
5346     // Main 16 byte comparison loop with 2 exits
5347     bind(NEXT_DWORD); {
5348       ldr(tmp1, Address(pre(a1, wordSize)));
5349       ldr(tmp2, Address(pre(a2, wordSize)));
5350       subs(cnt1, cnt1, 2 * elem_per_word);
5351       br(LE, TAIL);
5352       eor(tmp4, tmp3, tmp4);
5353       cbnz(tmp4, DONE);
5354       ldr(tmp3, Address(pre(a1, wordSize)));
5355       ldr(tmp4, Address(pre(a2, wordSize)));
5356       cmp(cnt1, (u1)elem_per_word);
5357       br(LE, TAIL2);
5358       cmp(tmp1, tmp2);
5359     } br(EQ, NEXT_DWORD);
5360     b(DONE);
5361 
5362     bind(TAIL);
5363     eor(tmp4, tmp3, tmp4);
5364     eor(tmp2, tmp1, tmp2);
5365     lslv(tmp2, tmp2, tmp5);
5366     orr(tmp5, tmp4, tmp2);
5367     cmp(tmp5, zr);
5368     b(CSET_EQ);
5369 
5370     bind(TAIL2);
5371     eor(tmp2, tmp1, tmp2);
5372     cbnz(tmp2, DONE);
5373     b(LAST_CHECK);
5374 
5375     bind(STUB);
5376     ldr(tmp4, Address(pre(a2, base_offset)));
5377     cmp(cnt2, cnt1);
5378     br(NE, DONE);
5379     if (elem_size == 2) { // convert to byte counter
5380       lsl(cnt1, cnt1, 1);
5381     }
5382     eor(tmp5, tmp3, tmp4);
5383     cbnz(tmp5, DONE);
5384     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5385     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5386     trampoline_call(stub);
5387     b(DONE);
5388 
5389     bind(EARLY_OUT);
    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2),
    // so if a2 == null we return false (0), otherwise true: returning a2
    // itself gives the right answer
5392     mov(result, a2);
5393     b(DONE);
5394     bind(SHORT);
5395     cmp(cnt2, cnt1);
5396     br(NE, DONE);
5397     cbz(cnt1, SAME);
5398     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5399     ldr(tmp3, Address(a1, base_offset));
5400     ldr(tmp4, Address(a2, base_offset));
5401     bind(LAST_CHECK);
5402     eor(tmp4, tmp3, tmp4);
5403     lslv(tmp5, tmp4, tmp5);
5404     cmp(tmp5, zr);
5405     bind(CSET_EQ);
5406     cset(result, EQ);
5407     b(DONE);
5408   }
5409 
5410   bind(SAME);
5411   mov(result, true);
5412   // That's it.
5413   bind(DONE);
5414 
5415   BLOCK_COMMENT("} array_equals");
5416 }
5417 
5418 // Compare Strings
5419 
5420 // For Strings we're passed the address of the first characters in a1
5421 // and a2 and the length in cnt1.
5422 // elem_size is the element size in bytes: either 1 or 2.
5423 // There are two implementations.  For arrays >= 8 bytes, all
5424 // comparisons (including the final one, which may overlap) are
5425 // performed 8 bytes at a time.  For strings < 8 bytes, we compare a
5426 // halfword, then a short, and then a byte.
5427 
5428 void MacroAssembler::string_equals(Register a1, Register a2,
5429                                    Register result, Register cnt1, int elem_size)
5430 {
5431   Label SAME, DONE, SHORT, NEXT_WORD;
5432   Register tmp1 = rscratch1;
5433   Register tmp2 = rscratch2;
5434   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5435 
  assert(elem_size == 1 || elem_size == 2, "must be 1 or 2 bytes");
5437   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5438 
5439 #ifndef PRODUCT
5440   {
5441     const char kind = (elem_size == 2) ? 'U' : 'L';
5442     char comment[64];
5443     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5444     BLOCK_COMMENT(comment);
5445   }
5446 #endif
5447 
5448   mov(result, false);
5449 
5450   // Check for short strings, i.e. smaller than wordSize.
5451   subs(cnt1, cnt1, wordSize);
5452   br(Assembler::LT, SHORT);
5453   // Main 8 byte comparison loop.
5454   bind(NEXT_WORD); {
5455     ldr(tmp1, Address(post(a1, wordSize)));
5456     ldr(tmp2, Address(post(a2, wordSize)));
5457     subs(cnt1, cnt1, wordSize);
5458     eor(tmp1, tmp1, tmp2);
5459     cbnz(tmp1, DONE);
5460   } br(GT, NEXT_WORD);
5461   // Last longword.  In the case where length == 4 we compare the
5462   // same longword twice, but that's still faster than another
5463   // conditional branch.
5464   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5465   // length == 4.
5466   ldr(tmp1, Address(a1, cnt1));
5467   ldr(tmp2, Address(a2, cnt1));
5468   eor(tmp2, tmp1, tmp2);
5469   cbnz(tmp2, DONE);
5470   b(SAME);
5471 
5472   bind(SHORT);
5473   Label TAIL03, TAIL01;
5474 
5475   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5476   {
5477     ldrw(tmp1, Address(post(a1, 4)));
5478     ldrw(tmp2, Address(post(a2, 4)));
5479     eorw(tmp1, tmp1, tmp2);
5480     cbnzw(tmp1, DONE);
5481   }
5482   bind(TAIL03);
5483   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5484   {
5485     ldrh(tmp1, Address(post(a1, 2)));
5486     ldrh(tmp2, Address(post(a2, 2)));
5487     eorw(tmp1, tmp1, tmp2);
5488     cbnzw(tmp1, DONE);
5489   }
5490   bind(TAIL01);
5491   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5492     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5493     {
5494       ldrb(tmp1, a1);
5495       ldrb(tmp2, a2);
5496       eorw(tmp1, tmp1, tmp2);
5497       cbnzw(tmp1, DONE);
5498     }
5499   }
5500   // Arrays are equal.
5501   bind(SAME);
5502   mov(result, true);
5503 
5504   // That's it.
5505   bind(DONE);
5506   BLOCK_COMMENT("} string_equals");
5507 }
5508 
5509 
5510 // The size of the blocks erased by the zero_blocks stub.  We must
5511 // handle anything smaller than this ourselves in zero_words().
5512 const int MacroAssembler::zero_words_block_size = 8;
5513 
5514 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5515 // possible, handling small word counts locally and delegating
5516 // anything larger to the zero_blocks stub.  It is expanded many times
5517 // in compiled code, so it is important to keep it short.
5518 
5519 // ptr:   Address of a buffer to be zeroed.
5520 // cnt:   Count in HeapWords.
5521 //
5522 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5523 void MacroAssembler::zero_words(Register ptr, Register cnt)
5524 {
5525   assert(is_power_of_2(zero_words_block_size), "adjust this");
5526   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5527 
5528   BLOCK_COMMENT("zero_words {");
5529   cmp(cnt, (u1)zero_words_block_size);
5530   Label around;
5531   br(LO, around);
5532   {
5533     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5534     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5535     if (StubRoutines::aarch64::complete()) {
5536       trampoline_call(zero_blocks);
5537     } else {
5538       bl(zero_blocks);
5539     }
5540   }
5541   bind(around);
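  // Peel off the remaining 0..7 words by testing the low bits of cnt: for
  // example (hypothetically), cnt == 5 stores two pairs for bit 2 and a
  // single word for bit 0.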
5542   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5543     Label l;
5544     tbz(cnt, exact_log2(i), l);
5545     for (int j = 0; j < i; j += 2) {
5546       stp(zr, zr, post(ptr, 16));
5547     }
5548     bind(l);
5549   }
5550   {
5551     Label l;
5552     tbz(cnt, 0, l);
5553     str(zr, Address(ptr));
5554     bind(l);
5555   }
5556   BLOCK_COMMENT("} zero_words");
5557 }
5558 
5559 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5560 // cnt:          Immediate count in HeapWords.
5561 #define SmallArraySize (18 * BytesPerLong)
5562 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
5563 {
5564   BLOCK_COMMENT("zero_words {");
5565   int i = cnt & 1;  // store any odd word to start
5566   if (i) str(zr, Address(base));
5567 
5568   if (cnt <= SmallArraySize / BytesPerLong) {
5569     for (; i < (int)cnt; i += 2)
5570       stp(zr, zr, Address(base, i * wordSize));
5571   } else {
5572     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5573     int remainder = cnt % (2 * unroll);
5574     for (; i < remainder; i += 2)
5575       stp(zr, zr, Address(base, i * wordSize));
5576 
5577     Label loop;
5578     Register cnt_reg = rscratch1;
5579     Register loop_base = rscratch2;
5580     cnt = cnt - remainder;
5581     mov(cnt_reg, cnt);
5582     // adjust base and prebias by -2 * wordSize so we can pre-increment
5583     add(loop_base, base, (remainder - 2) * wordSize);
5584     bind(loop);
5585     sub(cnt_reg, cnt_reg, 2 * unroll);
5586     for (i = 1; i < unroll; i++)
5587       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5588     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5589     cbnz(cnt_reg, loop);
5590   }
5591   BLOCK_COMMENT("} zero_words");
5592 }
5593 
5594 // Zero blocks of memory by using DC ZVA.
5595 //
// Aligns the base address first sufficiently for DC ZVA, then uses
5597 // DC ZVA repeatedly for every full block.  cnt is the size to be
5598 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5599 // in cnt.
5600 //
5601 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5602 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5603 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5604   Register tmp = rscratch1;
5605   Register tmp2 = rscratch2;
5606   int zva_length = VM_Version::zva_length();
5607   Label initial_table_end, loop_zva;
5608   Label fini;
5609 
  // Base must be 16 byte aligned. If not, just return and let the caller handle it.
5611   tst(base, 0x0f);
5612   br(Assembler::NE, fini);
5613   // Align base with ZVA length.
5614   neg(tmp, base);
5615   andr(tmp, tmp, zva_length - 1);
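  // (-base) & (zva_length - 1) is the distance from base up to the next
  // ZVA-aligned boundary (0 if already aligned); e.g. (hypothetically) with
  // zva_length == 64 and base ending in 0x30, tmp == 0x10.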
5616 
5617   // tmp: the number of bytes to be filled to align the base with ZVA length.
5618   add(base, base, tmp);
5619   sub(cnt, cnt, tmp, Assembler::ASR, 3);
5620   adr(tmp2, initial_table_end);
5621   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5622   br(tmp2);
5623 
5624   for (int i = -zva_length + 16; i < 0; i += 16)
5625     stp(zr, zr, Address(base, i));
5626   bind(initial_table_end);
5627 
5628   sub(cnt, cnt, zva_length >> 3);
5629   bind(loop_zva);
5630   dc(Assembler::ZVA, base);
5631   subs(cnt, cnt, zva_length >> 3);
5632   add(base, base, zva_length);
5633   br(Assembler::GE, loop_zva);
5634   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5635   bind(fini);
5636 }
5637 
5638 // base:   Address of a buffer to be filled, 8 bytes aligned.
// cnt:    Count in 8-byte units.
// value:  Value to fill the buffer with.
5641 // base will point to the end of the buffer after filling.
5642 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5643 {
5644 //  Algorithm:
5645 //
5646 //    scratch1 = cnt & 7;
5647 //    cnt -= scratch1;
5648 //    p += scratch1;
5649 //    switch (scratch1) {
5650 //      do {
5651 //        cnt -= 8;
5652 //          p[-8] = v;
5653 //        case 7:
5654 //          p[-7] = v;
5655 //        case 6:
5656 //          p[-6] = v;
5657 //          // ...
5658 //        case 1:
5659 //          p[-1] = v;
5660 //        case 0:
5661 //          p += 8;
5662 //      } while (cnt);
5663 //    }
5664 
5665   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5666 
5667   Label fini, skip, entry, loop;
5668   const int unroll = 8; // Number of stp instructions we'll unroll
5669 
5670   cbz(cnt, fini);
5671   tbz(base, 3, skip);
5672   str(value, Address(post(base, 8)));
5673   sub(cnt, cnt, 1);
5674   bind(skip);
5675 
5676   andr(rscratch1, cnt, (unroll-1) * 2);
5677   sub(cnt, cnt, rscratch1);
5678   add(base, base, rscratch1, Assembler::LSL, 3);
5679   adr(rscratch2, entry);
5680   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
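  // Each stp below is a 4-byte instruction covering two words, so backing up
  // from 'entry' by (rscratch1 / 2) stp's, i.e. rscratch1 * 2 bytes, makes
  // the first pass execute exactly the stores the remainder needs.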
5681   br(rscratch2);
5682 
5683   bind(loop);
5684   add(base, base, unroll * 16);
5685   for (int i = -unroll; i < 0; i++)
5686     stp(value, value, Address(base, i * 16));
5687   bind(entry);
5688   subs(cnt, cnt, unroll * 2);
5689   br(Assembler::GE, loop);
5690 
5691   tbz(cnt, 0, fini);
5692   str(value, Address(post(base, 8)));
5693   bind(fini);
5694 }
5695 
5696 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5697 // java/lang/StringUTF16.compress.
5698 void MacroAssembler::encode_iso_array(Register src, Register dst,
5699                       Register len, Register result,
5700                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5701                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5702 {
5703     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
5704         NEXT_32_START, NEXT_32_PRFM_START;
5705     Register tmp1 = rscratch1, tmp2 = rscratch2;
5706 
5707       mov(result, len); // Save initial len
5708 
5709 #ifndef BUILTIN_SIM
5710       cmp(len, (u1)8); // handle shortest strings first
5711       br(LT, LOOP_1);
5712       cmp(len, (u1)32);
5713       br(LT, NEXT_8);
5714       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
5715       // to convert chars to bytes
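      // uzp1 concatenates the even-indexed bytes of its two sources (the low
      // bytes of each little-endian char), while uzp2 gathers the odd-indexed
      // (high) bytes; a nonzero high byte marks a char that cannot be
      // ISO-8859-1 encoded.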
5716       if (SoftwarePrefetchHintDistance >= 0) {
5717         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5718         subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5719         br(LE, NEXT_32_START);
5720         b(NEXT_32_PRFM_START);
5721         BIND(NEXT_32_PRFM);
5722           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5723         BIND(NEXT_32_PRFM_START);
5724           prfm(Address(src, SoftwarePrefetchHintDistance));
5725           orr(v4, T16B, Vtmp1, Vtmp2);
5726           orr(v5, T16B, Vtmp3, Vtmp4);
5727           uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
5728           uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
5729           uzp2(v5, T16B, v4, v5); // high bytes
5730           umov(tmp2, v5, D, 1);
5731           fmovd(tmp1, v5);
5732           orr(tmp1, tmp1, tmp2);
5733           cbnz(tmp1, LOOP_8);
5734           stpq(Vtmp1, Vtmp3, dst);
5735           sub(len, len, 32);
5736           add(dst, dst, 32);
5737           add(src, src, 64);
5738           subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5739           br(GE, NEXT_32_PRFM);
5740           cmp(len, (u1)32);
5741           br(LT, LOOP_8);
5742         BIND(NEXT_32);
5743           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5744         BIND(NEXT_32_START);
5745       } else {
5746         BIND(NEXT_32);
5747           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5748       }
5749       prfm(Address(src, SoftwarePrefetchHintDistance));
5750       uzp1(v4, T16B, Vtmp1, Vtmp2);
5751       uzp1(v5, T16B, Vtmp3, Vtmp4);
5752       orr(Vtmp1, T16B, Vtmp1, Vtmp2);
5753       orr(Vtmp3, T16B, Vtmp3, Vtmp4);
5754       uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
5755       umov(tmp2, Vtmp1, D, 1);
5756       fmovd(tmp1, Vtmp1);
5757       orr(tmp1, tmp1, tmp2);
5758       cbnz(tmp1, LOOP_8);
5759       stpq(v4, v5, dst);
5760       sub(len, len, 32);
5761       add(dst, dst, 32);
5762       add(src, src, 64);
5763       cmp(len, (u1)32);
5764       br(GE, NEXT_32);
5765       cbz(len, DONE);
5766 
5767     BIND(LOOP_8);
5768       cmp(len, (u1)8);
5769       br(LT, LOOP_1);
5770     BIND(NEXT_8);
5771       ld1(Vtmp1, T8H, src);
5772       uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
5773       uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
5774       fmovd(tmp1, Vtmp3);
5775       cbnz(tmp1, NEXT_1);
5776       strd(Vtmp2, dst);
5777 
5778       sub(len, len, 8);
5779       add(dst, dst, 8);
5780       add(src, src, 16);
5781       cmp(len, (u1)8);
5782       br(GE, NEXT_8);
5783 
5784     BIND(LOOP_1);
5785 #endif
5786     cbz(len, DONE);
5787     BIND(NEXT_1);
5788       ldrh(tmp1, Address(post(src, 2)));
5789       tst(tmp1, 0xff00);
5790       br(NE, SET_RESULT);
5791       strb(tmp1, Address(post(dst, 1)));
5792       subs(len, len, 1);
5793       br(GT, NEXT_1);
5794 
5795     BIND(SET_RESULT);
5796       sub(result, result, len); // Return index where we stopped
5797                                 // Return len == 0 if we processed all
5798                                 // characters
5799     BIND(DONE);
5800 }
5801 
5802 
5803 // Inflate byte[] array to char[].
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                        Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);
  lsrw(tmp4, len, 3);
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
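    // 'zip1' interleaves the 4 loaded bytes with the zeroes in vtmp1,
    // yielding 4 chars that are stored as a single 8-byte unit.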
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes by steam (one at a time).
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    bind(to_stub);
      RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
      assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
      trampoline_call(stub);
      b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

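    // When software prefetching is enabled, arrays of at least
    // large_loop_threshold eight-byte chunks go to the stub (which does
    // its own prefetching) and the in-line loop below is unrolled twice;
    // otherwise a simple one-chunk-per-iteration loop suffices.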
    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);
      cmp(tmp4, (u1)large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);
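      // The first chunk is already in vtmp2, so the loop is entered at
      // loop_start; each unrolled iteration widens two chunks, issuing
      // the second load before the first store.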

      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      andw(len, len, 7);
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.
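  // The tail re-reads the last 8 source bytes and re-writes the last 16
  // destination bytes, deliberately overlapping the stores done above;
  // this avoids a byte-at-a-time cleanup loop.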
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
}

// Compress char[] array to byte[].
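// Returns (in 'result') the array length if every char was compressed,
// and zero if a char above 0xFF was encountered.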
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         FloatRegister tmp1Reg, FloatRegister tmp2Reg,
                                         FloatRegister tmp3Reg, FloatRegister tmp4Reg,
                                         Register result) {
  encode_iso_array(src, dst, len, result,
                   tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
  cmp(len, zr);
  csel(result, result, zr, EQ);
}

// get_thread() can be called anywhere inside generated code so we
// need to save whatever non-callee save context might get clobbered
// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
// the call setup code.
//
// aarch64_get_thread_helper() clobbers only r0, r1, and flags.
//
void MacroAssembler::get_thread(Register dst) {
  RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
  push(saved_regs, sp);

  mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
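  // blrt is the BUILTIN_SIM-aware form of blr; the extra operands
  // describe the native call for the simulator build (assumed here to
  // name the gp-arg count, fp-arg count, and return kind).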
  blrt(lr, 1, 0, 1);
  if (dst != c_rarg0) {
    mov(dst, c_rarg0);
  }

  pop(saved_regs, sp);
}

// C2 compiled method's prolog code.
// Moved here from aarch64.ad to support the Valhalla code below.
void MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  // n.b. frame size includes space for return pc and rfp
  const long framesize = C->frame_size_in_bytes();
  assert(framesize % (2 * wordSize) == 0, "must preserve 2 * wordSize alignment");

  // insert a nop at the start of the prolog so we can patch in a
  // branch if we need to invalidate the method later
  nop();

  int bangsize = C->bang_size_in_bytes();
  if (C->need_stack_bang(bangsize) && UseStackBanging) {
    generate_stack_overflow_check(bangsize);
  }

  build_frame(framesize);

  if (NotifySimulator) {
    notify(Assembler::method_entry);
  }

  if (VerifyStackAtCalls) {
    Unimplemented();
  }
}

void MacroAssembler::unpack_value_args(Compile* C, bool receiver_only) {
  // Called from MachVEP node
  unimplemented("Support for ValueTypePassFieldsAsArgs and ValueTypeReturnedAsFields is not implemented");
}

void MacroAssembler::store_value_type_fields_to_buf(ciValueKlass* vk) {
  super_call_VM_leaf(StubRoutines::store_value_type_fields_to_buf());
}