src/cpu/x86/vm/stubGenerator_x86_32.cpp
Print this page
rev 4130 : 8004835: Improve AES intrinsics on x86
Summary: Enable AES intrinsics on non-AVX cpus, group together aes instructions in crypto stubs.
Reviewed-by: roland, twisti
rev 4134 : 8005544: Use 256bit YMM registers in arraycopy stubs on x86
Summary: Use YMM registers in arraycopy and array_fill stubs.
Reviewed-by: roland, twisti
@@ -823,20 +823,26 @@
// Copy 64-byte chunks
__ jmpb(L_copy_64_bytes);
__ align(OptoLoopAlignment);
__ BIND(L_copy_64_bytes_loop);
- if(UseUnalignedLoadStores) {
+ if (UseUnalignedLoadStores) {
+ if (UseAVX >= 2) {
+ __ vmovdqu(xmm0, Address(from, 0));
+ __ vmovdqu(Address(from, to_from, Address::times_1, 0), xmm0);
+ __ vmovdqu(xmm1, Address(from, 32));
+ __ vmovdqu(Address(from, to_from, Address::times_1, 32), xmm1);
+ } else {
__ movdqu(xmm0, Address(from, 0));
__ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
__ movdqu(xmm1, Address(from, 16));
__ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
__ movdqu(xmm2, Address(from, 32));
__ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
__ movdqu(xmm3, Address(from, 48));
__ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
-
+ }
} else {
__ movq(xmm0, Address(from, 0));
__ movq(Address(from, to_from, Address::times_1, 0), xmm0);
__ movq(xmm1, Address(from, 8));
__ movq(Address(from, to_from, Address::times_1, 8), xmm1);
@@ -2201,66 +2207,93 @@
// c_rarg0 - source byte array address
// c_rarg1 - destination byte array address
// c_rarg2 - K (key) in little endian int array
//
address generate_aescrypt_encryptBlock() {
- assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+ assert(UseAES, "need AES instructions and misaligned SSE support");
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
Label L_doLast;
address start = __ pc();
- const Register from = rsi; // source array address
+ const Register from = rdx; // source array address
const Register to = rdx; // destination array address
const Register key = rcx; // key array address
const Register keylen = rax;
const Address from_param(rbp, 8+0);
const Address to_param (rbp, 8+4);
const Address key_param (rbp, 8+8);
const XMMRegister xmm_result = xmm0;
- const XMMRegister xmm_temp = xmm1;
- const XMMRegister xmm_key_shuf_mask = xmm2;
+ const XMMRegister xmm_key_shuf_mask = xmm1;
+ const XMMRegister xmm_temp1 = xmm2;
+ const XMMRegister xmm_temp2 = xmm3;
+ const XMMRegister xmm_temp3 = xmm4;
+ const XMMRegister xmm_temp4 = xmm5;
__ enter(); // required for proper stackwalking of RuntimeStub frame
- __ push(rsi);
- __ movptr(from , from_param);
- __ movptr(to , to_param);
- __ movptr(key , key_param);
+ __ movptr(from, from_param);
+ __ movptr(key, key_param);
+ // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
__ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
- // keylen = # of 32-bit words, convert to 128-bit words
- __ shrl(keylen, 2);
- __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more
__ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
__ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input
+ __ movptr(to, to_param);
// For encryption, the java expanded key ordering is just what we need
- load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
- __ pxor(xmm_result, xmm_temp);
- for (int offset = 0x10; offset <= 0x90; offset += 0x10) {
- aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
- }
- load_key (xmm_temp, key, 0xa0, xmm_key_shuf_mask);
- __ cmpl(keylen, 0);
- __ jcc(Assembler::equal, L_doLast);
- __ aesenc(xmm_result, xmm_temp); // only in 192 and 256 bit keys
- aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
- load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask);
- __ subl(keylen, 2);
- __ jcc(Assembler::equal, L_doLast);
- __ aesenc(xmm_result, xmm_temp); // only in 256 bit keys
- aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
- load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+ load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
+ __ pxor(xmm_result, xmm_temp1);
+ load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
+ load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
+ load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
+ load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
+
+ __ aesenc(xmm_result, xmm_temp1);
+ __ aesenc(xmm_result, xmm_temp2);
+ __ aesenc(xmm_result, xmm_temp3);
+ __ aesenc(xmm_result, xmm_temp4);
+
+ load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
+ load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
+ load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
+ load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
+
+ __ aesenc(xmm_result, xmm_temp1);
+ __ aesenc(xmm_result, xmm_temp2);
+ __ aesenc(xmm_result, xmm_temp3);
+ __ aesenc(xmm_result, xmm_temp4);
+
+ load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
+ load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
+
+ __ cmpl(keylen, 44);
+ __ jccb(Assembler::equal, L_doLast);
+
+ __ aesenc(xmm_result, xmm_temp1);
+ __ aesenc(xmm_result, xmm_temp2);
+
+ load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
+ load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
+
+ __ cmpl(keylen, 52);
+ __ jccb(Assembler::equal, L_doLast);
+
+ __ aesenc(xmm_result, xmm_temp1);
+ __ aesenc(xmm_result, xmm_temp2);
+
+ load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
+ load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
+
__ BIND(L_doLast);
- __ aesenclast(xmm_result, xmm_temp);
+ __ aesenc(xmm_result, xmm_temp1);
+ __ aesenclast(xmm_result, xmm_temp2);
__ movdqu(Address(to, 0), xmm_result); // store the result
__ xorptr(rax, rax); // return 0
- __ pop(rsi);
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
@@ -2272,70 +2305,95 @@
// c_rarg0 - source byte array address
// c_rarg1 - destination byte array address
// c_rarg2 - K (key) in little endian int array
//
address generate_aescrypt_decryptBlock() {
- assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+ assert(UseAES, "need AES instructions and misaligned SSE support");
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
Label L_doLast;
address start = __ pc();
- const Register from = rsi; // source array address
+ const Register from = rdx; // source array address
const Register to = rdx; // destination array address
const Register key = rcx; // key array address
const Register keylen = rax;
const Address from_param(rbp, 8+0);
const Address to_param (rbp, 8+4);
const Address key_param (rbp, 8+8);
const XMMRegister xmm_result = xmm0;
- const XMMRegister xmm_temp = xmm1;
- const XMMRegister xmm_key_shuf_mask = xmm2;
+ const XMMRegister xmm_key_shuf_mask = xmm1;
+ const XMMRegister xmm_temp1 = xmm2;
+ const XMMRegister xmm_temp2 = xmm3;
+ const XMMRegister xmm_temp3 = xmm4;
+ const XMMRegister xmm_temp4 = xmm5;
__ enter(); // required for proper stackwalking of RuntimeStub frame
- __ push(rsi);
- __ movptr(from , from_param);
- __ movptr(to , to_param);
- __ movptr(key , key_param);
+ __ movptr(from, from_param);
+ __ movptr(key, key_param);
+ // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
__ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
- // keylen = # of 32-bit words, convert to 128-bit words
- __ shrl(keylen, 2);
- __ subl(keylen, 11); // every key has at least 11 128-bit words, some have more
__ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
__ movdqu(xmm_result, Address(from, 0));
+ __ movptr(to, to_param);
// for decryption java expanded key ordering is rotated one position from what we want
// so we start from 0x10 here and hit 0x00 last
// we don't know if the key is aligned, hence not using load-execute form
- load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask);
- __ pxor (xmm_result, xmm_temp);
- for (int offset = 0x20; offset <= 0xa0; offset += 0x10) {
- aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
- }
- __ cmpl(keylen, 0);
- __ jcc(Assembler::equal, L_doLast);
- // only in 192 and 256 bit keys
- aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
- aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask);
- __ subl(keylen, 2);
- __ jcc(Assembler::equal, L_doLast);
- // only in 256 bit keys
- aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
- aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask);
+ load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
+ load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
+ load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
+ load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
+ __ pxor (xmm_result, xmm_temp1);
+ __ aesdec(xmm_result, xmm_temp2);
+ __ aesdec(xmm_result, xmm_temp3);
+ __ aesdec(xmm_result, xmm_temp4);
+
+ load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
+ load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
+ load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
+ load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
+
+ __ aesdec(xmm_result, xmm_temp1);
+ __ aesdec(xmm_result, xmm_temp2);
+ __ aesdec(xmm_result, xmm_temp3);
+ __ aesdec(xmm_result, xmm_temp4);
+
+ load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
+ load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
+ load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
+
+ __ cmpl(keylen, 44);
+ __ jccb(Assembler::equal, L_doLast);
+
+ __ aesdec(xmm_result, xmm_temp1);
+ __ aesdec(xmm_result, xmm_temp2);
+
+ load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
+ load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
+
+ __ cmpl(keylen, 52);
+ __ jccb(Assembler::equal, L_doLast);
+
+ __ aesdec(xmm_result, xmm_temp1);
+ __ aesdec(xmm_result, xmm_temp2);
+
+ load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
+ load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
+
__ BIND(L_doLast);
- // for decryption the aesdeclast operation is always on key+0x00
- load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
- __ aesdeclast(xmm_result, xmm_temp);
+ __ aesdec(xmm_result, xmm_temp1);
+ __ aesdec(xmm_result, xmm_temp2);
+ // for decryption the aesdeclast operation is always on key+0x00
+ __ aesdeclast(xmm_result, xmm_temp3);
__ movdqu(Address(to, 0), xmm_result); // store the result
-
__ xorptr(rax, rax); // return 0
- __ pop(rsi);
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
@@ -2367,11 +2425,11 @@
// c_rarg2 - K (key) in little endian int array
// c_rarg3 - r vector byte array address
// c_rarg4 - input length
//
address generate_cipherBlockChaining_encryptAESCrypt() {
- assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+ assert(UseAES, "need AES instructions and misaligned SSE support");
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
address start = __ pc();
Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
@@ -2420,11 +2478,11 @@
__ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
__ cmpl(rax, 44);
__ jcc(Assembler::notEqual, L_key_192_256);
// 128 bit code follows here
- __ movptr(pos, 0);
+ __ movl(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_loopTop_128);
__ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
__ pxor (xmm_result, xmm_temp); // xor with the current r vector
@@ -2456,11 +2514,11 @@
// here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
__ cmpl(rax, 52);
__ jcc(Assembler::notEqual, L_key_256);
// 192-bit code follows here (could be changed to use more xmm registers)
- __ movptr(pos, 0);
+ __ movl(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_loopTop_192);
__ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
__ pxor (xmm_result, xmm_temp); // xor with the current r vector
@@ -2481,11 +2539,11 @@
__ jcc(Assembler::notEqual, L_loopTop_192);
__ jmp(L_exit);
__ BIND(L_key_256);
// 256-bit code follows here (could be changed to use more xmm registers)
- __ movptr(pos, 0);
+ __ movl(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_loopTop_256);
__ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input
__ pxor (xmm_result, xmm_temp); // xor with the current r vector
@@ -2522,11 +2580,11 @@
// c_rarg3 - r vector byte array address
// c_rarg4 - input length
//
address generate_cipherBlockChaining_decryptAESCrypt() {
- assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
+ assert(UseAES, "need AES instructions and misaligned SSE support");
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
address start = __ pc();
Label L_exit, L_key_192_256, L_key_256;
@@ -2583,11 +2641,11 @@
__ cmpl(rax, 44);
__ jcc(Assembler::notEqual, L_key_192_256);
// 128-bit code follows here, parallelized
- __ movptr(pos, 0);
+ __ movl(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_singleBlock_loopTop_128);
__ cmpptr(len_reg, 0); // any blocks left??
__ jcc(Assembler::equal, L_exit);
__ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
@@ -2624,11 +2682,11 @@
// here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
__ cmpl(rax, 52);
__ jcc(Assembler::notEqual, L_key_256);
// 192-bit code follows here (could be optimized to use parallelism)
- __ movptr(pos, 0);
+ __ movl(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_singleBlock_loopTop_192);
__ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
__ pxor (xmm_result, xmm_key_first); // do the aes dec rounds
for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
@@ -2649,11 +2707,11 @@
__ jcc(Assembler::notEqual,L_singleBlock_loopTop_192);
__ jmp(L_exit);
__ BIND(L_key_256);
// 256-bit code follows here (could be optimized to use parallelism)
- __ movptr(pos, 0);
+ __ movl(pos, 0);
__ align(OptoLoopAlignment);
__ BIND(L_singleBlock_loopTop_256);
__ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
__ pxor (xmm_result, xmm_key_first); // do the aes dec rounds
for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {