src/cpu/x86/vm/stubGenerator_x86_64.cpp

3664     __ aesdec(xmm_result, xmm_key11);
3665     load_key(xmm_temp, key, 0xc0);
3666     __ aesdec(xmm_result, xmm_temp);
3667     load_key(xmm_temp, key, 0xd0);
3668     __ aesdec(xmm_result, xmm_temp);
3669     load_key(xmm_temp, key, 0xe0);     // 256-bit key goes up to e0
3670     __ aesdec(xmm_result, xmm_temp);
3671     __ aesdeclast(xmm_result, xmm_key_last);          // xmm15 came from key+0
3672     __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
3673     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
3674     // no need to store r to memory until we exit
3675     __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);  // set up next r vector with cipher input from this block
3676     __ addptr(pos, AESBlockSize);
3677     __ subptr(len_reg, AESBlockSize);
3678     __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);
3679     __ jmp(L_exit);
3680 
3681     return start;
3682   }
3683 
3684 
3685   // byte swap mask for a 128-bit value held as two x86 longs (swaps the 64-bit halves)
3686   address generate_ghash_long_swap_mask() {
3687     __ align(CodeEntryAlignment);
3688     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
3689     address start = __ pc();
3690     __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
3691     __ emit_data64(0x0706050403020100, relocInfo::none );
3692     return start;
3693   }
3694 
3695   // byte swap mask for an x86 byte array (reverses all 16 bytes)
3696   address generate_ghash_byte_swap_mask() {
3697     __ align(CodeEntryAlignment);
3698     StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
3699     address start = __ pc();
3700     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
3701     __ emit_data64(0x0001020304050607, relocInfo::none );
3702     return start;
3703   }
3704 
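  // GHASH background: each 16-byte block of input is xor-ed into the running
  // state and the result is multiplied by the hash subkey H in GF(2^128),
  // modulo x^128 + x^7 + x^2 + x + 1. GCM defines this field with a reflected
  // bit order, which is why the operands are reordered with the two pshufb
  // masks emitted above before the carry-less multiply:
  //  - ghash_long_swap_mask swaps the two 64-bit halves; combined with the
  //    little-endian layout of each x86 long this byte-reverses a 128-bit
  //    value held as two longs (the state and the subkey),
  //  - ghash_byte_swap_mask reverses all 16 bytes of a big-endian byte array
  //    (the data blocks).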
3705   /* Single and multi-block ghash operations */
3706   address generate_ghash_processBlocks() {
3707     __ align(CodeEntryAlignment);
3708     Label L_ghash_loop, L_exit;
3709     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3710     address start = __ pc();
3711 
3712     const Register state        = c_rarg0;
3713     const Register subkeyH      = c_rarg1;
3714     const Register data         = c_rarg2;
3715     const Register blocks       = c_rarg3;
3716 
3717 #ifdef _WIN64
3718     const int XMM_REG_LAST  = 10;
3719 #endif
3720 
3721     const XMMRegister xmm_temp0 = xmm0;
3722     const XMMRegister xmm_temp1 = xmm1;
3723     const XMMRegister xmm_temp2 = xmm2;
3724     const XMMRegister xmm_temp3 = xmm3;
3725     const XMMRegister xmm_temp4 = xmm4;
3726     const XMMRegister xmm_temp5 = xmm5;
3727     const XMMRegister xmm_temp6 = xmm6;
3728     const XMMRegister xmm_temp7 = xmm7;
3729     const XMMRegister xmm_temp8 = xmm8;
3730     const XMMRegister xmm_temp9 = xmm9;
3731     const XMMRegister xmm_temp10 = xmm10;
3732 
3733     __ enter();
3734 
3735 #ifdef _WIN64
3736     // save the xmm registers which must be preserved: xmm6-xmm10 are callee-saved in the Win64 ABI
3737     __ subptr(rsp, -rsp_after_call_off * wordSize);
3738     for (int i = 6; i <= XMM_REG_LAST; i++) {
3739       __ movdqu(xmm_save(i), as_XMMRegister(i));
3740     }
3741 #endif
3742 
3743     __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
3744 
3745     __ movdqu(xmm_temp0, Address(state, 0));
3746     __ pshufb(xmm_temp0, xmm_temp10);
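    // The state is converted to the arithmetic byte order once, up front; the
    // loop keeps it in that order (xmm6 is copied back into xmm0 un-swapped)
    // and it is only swapped back when the result is stored at L_exit.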
3747 
3748 
3749     __ BIND(L_ghash_loop);
3750     __ movdqu(xmm_temp2, Address(data, 0));
3751     __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
3752 
3753     __ movdqu(xmm_temp1, Address(subkeyH, 0));
3754     __ pshufb(xmm_temp1, xmm_temp10);
3755 
3756     __ pxor(xmm_temp0, xmm_temp2);      // xor the state with the incoming data block
3757 
3758     //
3759     // Multiply with the hash key
3760     //
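    // pclmulqdq computes a 64x64 -> 128 bit carry-less product. The immediate
    // selects the operand halves (bit 0: low/high qword of the destination,
    // bit 4: low/high qword of the source), so the four products below are the
    // partial products of a schoolbook 128x128 -> 256 bit multiplication.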
3761     __ movdqu(xmm_temp3, xmm_temp0);
3762     __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
3763     __ movdqu(xmm_temp4, xmm_temp0);
3764     __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
3765 
3766     __ movdqu(xmm_temp5, xmm_temp0);
3767     __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
3768     __ movdqu(xmm_temp6, xmm_temp0);
3769     __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
3770 
3771     __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
3772 
3773     __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
3774     __ psrldq(xmm_temp4, 8);    // shift xmm4 right by 64 bits
3775     __ pslldq(xmm_temp5, 8);    // shift xmm5 left by 64 bits
3776     __ pxor(xmm_temp3, xmm_temp5);
3777     __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
3778                                 // of the carry-less multiplication of
3779                                 // xmm0 by xmm1.
3780 
3781     // We shift the result of the multiplication by one bit position
3782     // to the left to compensate for the fact that the bits are reversed.
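    // SSE has no bit shift that crosses lane boundaries, so the 1-bit shift of
    // the 256-bit value <xmm6:xmm3> is assembled from 32-bit lane shifts:
    // pslld/psrld produce the shifted lanes and the carry-out bits, and
    // pslldq/psrldq move the carries into the neighbouring lane (xmm9 carries
    // the top bit of xmm3 across into xmm6).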
3783     __ movdqu(xmm_temp7, xmm_temp3);
3784     __ movdqu(xmm_temp8, xmm_temp6);
3785     __ pslld(xmm_temp3, 1);
3786     __ pslld(xmm_temp6, 1);
3787     __ psrld(xmm_temp7, 31);
3788     __ psrld(xmm_temp8, 31);
3789     __ movdqu(xmm_temp9, xmm_temp7);
3790     __ pslldq(xmm_temp8, 4);
3791     __ pslldq(xmm_temp7, 4);
3792     __ psrldq(xmm_temp9, 12);
3793     __ por(xmm_temp3, xmm_temp7);
3794     __ por(xmm_temp6, xmm_temp8);
3795     __ por(xmm_temp6, xmm_temp9);
3796 
3797     //
3798     // First phase of the reduction
3799     //
3800     // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
3801     // independently.
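    // The two reduction phases fold the low half of the product (xmm3) into the
    // high half (xmm6), which ends up holding the 128-bit result, reducing
    // modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1. In the reflected bit
    // order this becomes the <<31/<<30/<<25 and >>1/>>2/>>7 shift-and-xor
    // sequences below, essentially the scheme from Intel's carry-less
    // multiplication / GCM white paper.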
3802     __ movdqu(xmm_temp7, xmm_temp3);
3803     __ movdqu(xmm_temp8, xmm_temp3);
3804     __ movdqu(xmm_temp9, xmm_temp3);
3805     __ pslld(xmm_temp7, 31);    // packed left shift by 31 bits
3806     __ pslld(xmm_temp8, 30);    // packed left shift by 30 bits
3807     __ pslld(xmm_temp9, 25);    // packed left shift by 25 bits
3808     __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
3809     __ pxor(xmm_temp7, xmm_temp9);
3810     __ movdqu(xmm_temp8, xmm_temp7);
3811     __ pslldq(xmm_temp7, 12);
3812     __ psrldq(xmm_temp8, 4);
3813     __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
3814 
3815     //
3816     // Second phase of the reduction
3817     //
3818     // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
3819     // shift operations.
3820     __ movdqu(xmm_temp2, xmm_temp3);
3821     __ movdqu(xmm_temp4, xmm_temp3);
3822     __ movdqu(xmm_temp5, xmm_temp3);
3823     __ psrld(xmm_temp2, 1);     // packed right shift by 1 bit
3824     __ psrld(xmm_temp4, 2);     // packed right shift by 2 bits
3825     __ psrld(xmm_temp5, 7);     // packed right shift by 7 bits
3826     __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
3827     __ pxor(xmm_temp2, xmm_temp5);
3828     __ pxor(xmm_temp2, xmm_temp8);
3829     __ pxor(xmm_temp3, xmm_temp2);
3830     __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
3831 
3832     __ decrement(blocks);
3833     __ jcc(Assembler::zero, L_exit);
3834     __ movdqu(xmm_temp0, xmm_temp6);
3835     __ addptr(data, 16);
3836     __ jmp(L_ghash_loop);
3837 
3838     __ BIND(L_exit);
3839     __ pshufb(xmm_temp6, xmm_temp10);           // Byte swap 16-byte result
3840     __ movdqu(Address(state, 0), xmm_temp6);   // store the result
3841 
3842 #ifdef _WIN64
3843     // restore xmm regs belonging to calling function
3844     for (int i = 6; i <= XMM_REG_LAST; i++) {
3845       __ movdqu(as_XMMRegister(i), xmm_save(i));
3846     }
3847 #endif
3848     __ leave();
3849     __ ret(0);
3850     return start;
3851   }
3852 
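  // For reference only (not used or emitted by the stub above): a minimal,
  // unoptimized sketch of the per-block operation that the loop in
  // generate_ghash_processBlocks implements, following the bitwise GF(2^128)
  // multiply from NIST SP 800-38D. The helper name ghash_block_ref is purely
  // illustrative and does not exist elsewhere in the code base.
  static void ghash_block_ref(unsigned char state[16],
                              const unsigned char subkeyH[16],
                              const unsigned char block[16]) {
    unsigned char x[16], v[16], z[16];
    for (int i = 0; i < 16; i++) { x[i] = state[i] ^ block[i]; v[i] = subkeyH[i]; z[i] = 0; }
    for (int i = 0; i < 128; i++) {
      // bit i of x, counting from the most significant bit of x[0] (GCM bit order)
      if (x[i / 8] & (0x80 >> (i % 8))) {
        for (int j = 0; j < 16; j++) z[j] ^= v[j];            // z ^= v
      }
      int carry = v[15] & 1;                                  // bit shifted out of v
      for (int j = 15; j > 0; j--) v[j] = (unsigned char)((v[j] >> 1) | (v[j - 1] << 7));
      v[0] >>= 1;
      if (carry) v[0] ^= 0xE1;                                // reduce by x^128 + x^7 + x^2 + x + 1
    }
    for (int i = 0; i < 16; i++) state[i] = z[i];             // state = (state ^ block) * H
  }
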
3853   /**
3854    *  Arguments:
3855    *
3856    * Inputs:
3857    *   c_rarg0   - int crc
3858    *   c_rarg1   - byte* buf
3859    *   c_rarg2   - int length
3860    *
3861    * Output:
3862    *       rax   - int crc result
3863    */
3864   address generate_updateBytesCRC32() {
3865     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
3866 
3867     __ align(CodeEntryAlignment);
3868     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3869 
3870     address start = __ pc();
3871     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
3872     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)


4270     StubRoutines::x86::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
4271     StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
4272     StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
4273 
4274     // support for verify_oop (must happen after universe_init)
4275     StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
4276 
4277     // arraycopy stubs used by compilers
4278     generate_arraycopy_stubs();
4279 
4280     generate_math_stubs();
4281 
4282     // don't bother generating these AES intrinsic stubs unless global flag is set
4283     if (UseAESIntrinsics) {
4284       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
4285 
4286       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4287       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4288       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4289       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
4290     }
4291 
4292     // Generate GHASH intrinsics code
4293     if (UseGHASHIntrinsics) {
4294       StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
4295       StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
4296       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
4297     }
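    // (UseGHASHIntrinsics is expected to be set by vm_version only when the CPU
    //  reports CLMUL and suitable SSE support; the stub address registered above
    //  is what the GHASH.processBlocks intrinsic dispatches to.)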
4298 
4299     // Safefetch stubs.
4300     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4301                                                        &StubRoutines::_safefetch32_fault_pc,
4302                                                        &StubRoutines::_safefetch32_continuation_pc);
4303     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4304                                                        &StubRoutines::_safefetchN_fault_pc,
4305                                                        &StubRoutines::_safefetchN_continuation_pc);
4306 #ifdef COMPILER2
4307     if (UseMultiplyToLenIntrinsic) {
4308       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4309     }
4310     if (UseSquareToLenIntrinsic) {
4311       StubRoutines::_squareToLen = generate_squareToLen();
4312     }
4313     if (UseMulAddIntrinsic) {
4314       StubRoutines::_mulAdd = generate_mulAdd();
4315     }
4316 #endif