// NOTE(review): this span appears to be the "before" column of a side-by-side
// diff of an x86 architecture-description file; fused diff line numbers have
// been removed and the line structure restored. Code tokens are unchanged.

// Float masks come from different places depending on platform.
// With _LP64 defined every mask address is served by StubRoutines::x86;
// otherwise only the four scalar masks are available, from the *_pool symbols.
#ifdef _LP64
  static address float_signmask() { return StubRoutines::x86::float_sign_mask(); }
  static address float_signflip() { return StubRoutines::x86::float_sign_flip(); }
  static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
  static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
  static address vector_float_signmask() { return StubRoutines::x86::vector_float_sign_mask(); }
  static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip(); }
  static address vector_double_signmask() { return StubRoutines::x86::vector_double_sign_mask(); }
  static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip(); }
  static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
  static address vector_byte_bitset() { return StubRoutines::x86::vector_byte_bitset(); }
  static address vector_long_perm_mask() { return StubRoutines::x86::vector_long_perm_mask(); }
  static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
  static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
  static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
  static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
  static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
  static address vector_all_ones_mask() { return StubRoutines::x86::vector_all_ones_mask(); }
#else
  static address float_signmask() { return (address)float_signmask_pool; }
  static address float_signflip() { return (address)float_signflip_pool; }
  static address double_signmask() { return (address)double_signmask_pool; }
  static address double_signflip() { return (address)double_signflip_pool; }
#endif


// Reports whether the matcher may use the rule for `opcode`: false when no
// match rule exists, otherwise true unless a per-opcode check below clears
// ret_value.
const bool Matcher::match_rule_supported(int opcode) {
  if (!has_match_rule(opcode))
    return false;

  bool ret_value = true;
  switch (opcode) {
    case Op_PopCountI:
    case Op_PopCountL:
      if (!UsePopCountInstruction)
        ret_value = false;
      break;
    case Op_PopCountVI:
  // [... source lines 1372-1510 are not included in this excerpt ...]
  // NOTE(review): the cases below read bt, vlen and size_in_bits, none of which
  // are declared in the visible part above - presumably they belong to a
  // different function whose header lies in the elided range; confirm.
      else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
      break;
    case Op_MinReductionV:
    case Op_MaxReductionV:
      if ((bt == T_INT || bt == T_LONG || bt == T_BYTE) && UseSSE <= 3) { ret_value = false; }
      else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
      break;
    case Op_VectorBlend:
      if (UseSSE <= 3 && UseAVX == 0) { ret_value = false; }
      else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
      break;
    case Op_VectorTest:
      if (UseAVX <= 0) { ret_value = false; }
      else if (size_in_bits != 128 && size_in_bits != 256) { ret_value = false; } // Implementation limitation
      break;
    case Op_VectorLoadMask:
      if (UseSSE <= 3) { ret_value = false; }
      else if (vlen == 1 || vlen == 2) { ret_value = false; } // Implementation limitation
      else if (size_in_bits >= 256 && UseAVX < 2) { ret_value = false; } // Implementation limitation
      break;
    case Op_VectorStoreMask:
      // NOTE(review): "UseAVX < 0" differs from the "UseAVX <= 0" tests used by
      // the neighbouring cases - confirm whether this is intentional.
      if (UseAVX < 0) { ret_value = false; } // Implementation limitation
      else if ((size_in_bits >= 256 || bt == T_LONG || bt == T_DOUBLE) && UseAVX < 2) { ret_value = false; } // Implementation limitation
      else if (vlen == 1 || vlen == 2) { ret_value = false; } // Implementation limitation
      else if (size_in_bits == 512 && !VM_Version::supports_avx512bw()) { ret_value = false; } // Implementation limitation
      break;
    case Op_VectorCastB2X:
      if (UseAVX <= 0) { ret_value = false; }
      else if (size_in_bits >= 256 && UseAVX < 2) { ret_value = false; }
      break;
    case Op_VectorCastS2X:
      if (UseAVX <= 0) { ret_value = false; }
      else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
      else if (is_integral_type(bt) && vlen * type2aelembytes(T_SHORT) * BitsPerByte == 256 && UseAVX < 2) { ret_value = false; }
      break;
    case Op_VectorCastI2X:
      if (UseAVX <= 0) { ret_value = false; }
      else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
      else if (is_integral_type(bt) && vlen * type2aelembytes(T_INT) * BitsPerByte == 256 && UseAVX < 2) { ret_value = false; }
      break;
  // [... source lines 1551-23517 are not included in this excerpt ...]
  // (tail of an instruct whose head lies in the elided range)
  ins_pipe( pipe_slow );
%}

// Store mask, 8 long lanes to 8 bytes: compares $src against the
// vector_all_bits_set constant into k2, then does a k2-masked byte move from the
// vector_byte_bitset constant into $dst.
instruct storemask8l(vecD dst, vecZ src, rRegL scratch) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 8);
  match(Set dst (VectorStoreMask src));
  effect(TEMP scratch);
  format %{ "vpcmpeqq k2,$src,0xFFFFFFFF\n\t"
            "vmovdqub $dst,k2,0x01010101\t! store mask (8L to 8B)" %}
  ins_encode %{
    int vector_len = 2;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    Assembler::ComparisonPredicate cp = Assembler::eq;
    __ evpcmpq(ktmp, k0, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), cp, vector_len, $scratch$$Register);
    // The dst is only 128-bit - thus we can do a smaller move.
    __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_byte_bitset()), false, 0, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- FMA --------------------------------------

// a * b + c
instruct vfma2D_reg(vecX a, vecX b, vecX c) %{
  predicate(UseFMA && n->as_Vector()->length() == 2);
  match(Set c (FmaVD c (Binary a b)));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 0;
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
// (this rule is cut off by the end of the excerpt)
instruct vfma2D_mem(vecX a, memory b, vecX c) %{
  predicate(UseFMA && n->as_Vector()->length() == 2);
  match(Set c (FmaVD c (Binary a (LoadVector b))));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
|

// Float masks come from different places depending on platform.
// NOTE(review): this span appears to be the "after" column of a side-by-side
// diff of an x86 architecture-description file; fused diff line numbers have
// been removed and the line structure restored.
//
// With _LP64 defined every mask address is served by StubRoutines::x86
// (including the shuffle/size masks used by the loadshuffle rules below);
// otherwise only the four scalar masks are available, from the *_pool symbols.
#ifdef _LP64
  static address float_signmask() { return StubRoutines::x86::float_sign_mask(); }
  static address float_signflip() { return StubRoutines::x86::float_sign_flip(); }
  static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
  static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
  static address vector_float_signmask() { return StubRoutines::x86::vector_float_sign_mask(); }
  static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip(); }
  static address vector_double_signmask() { return StubRoutines::x86::vector_double_sign_mask(); }
  static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip(); }
  static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
  static address vector_byte_bitset() { return StubRoutines::x86::vector_byte_bitset(); }
  static address vector_long_perm_mask() { return StubRoutines::x86::vector_long_perm_mask(); }
  static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
  static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
  static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
  static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
  static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
  static address vector_all_ones_mask() { return StubRoutines::x86::vector_all_ones_mask(); }
  static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
  static address vector_int_sizemask() { return StubRoutines::x86::vector_int_size_mask(); }
  static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
  static address vector_short_sizemask() { return StubRoutines::x86::vector_short_size_mask(); }
  static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
  static address vector_long_sizemask() { return StubRoutines::x86::vector_long_size_mask(); }
#else
  static address float_signmask() { return (address)float_signmask_pool; }
  static address float_signflip() { return (address)float_signflip_pool; }
  static address double_signmask() { return (address)double_signmask_pool; }
  static address double_signflip() { return (address)double_signflip_pool; }
#endif


// Reports whether the matcher may use the rule for `opcode`: false when no
// match rule exists, otherwise true unless a per-opcode check below clears
// ret_value.
const bool Matcher::match_rule_supported(int opcode) {
  if (!has_match_rule(opcode))
    return false;

  bool ret_value = true;
  switch (opcode) {
    case Op_PopCountI:
    case Op_PopCountL:
      if (!UsePopCountInstruction)
        ret_value = false;
      break;
    case Op_PopCountVI:
  // [... source lines 1378-1516 are not included in this excerpt ...]
  // NOTE(review): the cases below read bt, vlen and size_in_bits, none of which
  // are declared in the visible part above - presumably they belong to a
  // different function whose header lies in the elided range; confirm.
      else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
      break;
    case Op_MinReductionV:
    case Op_MaxReductionV:
      if ((bt == T_INT || bt == T_LONG || bt == T_BYTE) && UseSSE <= 3) { ret_value = false; }
      else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
      break;
    case Op_VectorBlend:
      if (UseSSE <= 3 && UseAVX == 0) { ret_value = false; }
      else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
      break;
    case Op_VectorTest:
      if (UseAVX <= 0) { ret_value = false; }
      else if (size_in_bits != 128 && size_in_bits != 256) { ret_value = false; } // Implementation limitation
      break;
    case Op_VectorLoadMask:
      if (UseSSE <= 3) { ret_value = false; }
      else if (vlen == 1 || vlen == 2) { ret_value = false; } // Implementation limitation
      else if (size_in_bits >= 256 && UseAVX < 2) { ret_value = false; } // Implementation limitation
      break;
    case Op_VectorLoadShuffle:
    case Op_VectorRearrange:
      if (vlen == 1 || vlen == 2) { ret_value = false; } // Implementation limitation due to how shuffle is loaded
      else if (size_in_bits >= 256 && UseAVX < 2) { ret_value = false; } // Implementation limitation
      else if (bt == T_BYTE && size_in_bits >= 256 && !VM_Version::supports_avx512vbmi()) { ret_value = false; } // Implementation limitation
      else if (bt == T_SHORT && size_in_bits >= 256 && !VM_Version::supports_avx512vlbw()) { ret_value = false; } // Implementation limitation
      break;
    case Op_VectorStoreMask:
      // NOTE(review): "UseAVX < 0" differs from the "UseAVX <= 0" tests used by
      // the neighbouring cases - confirm whether this is intentional.
      if (UseAVX < 0) { ret_value = false; } // Implementation limitation
      else if ((size_in_bits >= 256 || bt == T_LONG || bt == T_DOUBLE) && UseAVX < 2) { ret_value = false; } // Implementation limitation
      else if (vlen == 1 || vlen == 2) { ret_value = false; } // Implementation limitation
      else if (size_in_bits == 512 && !VM_Version::supports_avx512bw()) { ret_value = false; } // Implementation limitation
      break;
    case Op_VectorCastB2X:
      if (UseAVX <= 0) { ret_value = false; }
      else if (size_in_bits >= 256 && UseAVX < 2) { ret_value = false; }
      break;
    case Op_VectorCastS2X:
      if (UseAVX <= 0) { ret_value = false; }
      else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
      else if (is_integral_type(bt) && vlen * type2aelembytes(T_SHORT) * BitsPerByte == 256 && UseAVX < 2) { ret_value = false; }
      break;
    case Op_VectorCastI2X:
      if (UseAVX <= 0) { ret_value = false; }
      else if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) { ret_value = false; }
      else if (is_integral_type(bt) && vlen * type2aelembytes(T_INT) * BitsPerByte == 256 && UseAVX < 2) { ret_value = false; }
      break;
  // [... source lines 1564-23530 are not included in this excerpt ...]
  // (tail of an instruct whose head lies in the elided range)
  ins_pipe( pipe_slow );
%}

// Store mask, 8 long lanes to 8 bytes: compares $src against the
// vector_all_bits_set constant into k2, then does a k2-masked byte move from the
// vector_byte_bitset constant into $dst.
instruct storemask8l(vecD dst, vecZ src, rRegL scratch) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8 && static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 8);
  match(Set dst (VectorStoreMask src));
  effect(TEMP scratch);
  format %{ "vpcmpeqq k2,$src,0xFFFFFFFF\n\t"
            "vmovdqub $dst,k2,0x01010101\t! store mask (8L to 8B)" %}
  ins_encode %{
    int vector_len = 2;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    Assembler::ComparisonPredicate cp = Assembler::eq;
    __ evpcmpq(ktmp, k0, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), cp, vector_len, $scratch$$Register);
    // The dst is only 128-bit - thus we can do a smaller move.
    __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_byte_bitset()), false, 0, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

//-------------------------------- LOAD_SHUFFLE ----------------------------------
// The VectorLoadShuffle rules turn the byte-per-lane shuffle in $src into the
// index vector consumed by the matching VectorRearrange rule further down.

// Byte lanes: the shuffle is copied unchanged.
instruct loadshuffle8b(vecD dst, vecD src) %{
  predicate(UseSSE > 1 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorLoadShuffle src));
  format %{ "movdqu $dst, $src\t! load shuffle (load 8B for 8BRearrange)" %}
  ins_encode %{
    __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadshuffle16b(vecX dst, vecX src) %{
  predicate(UseSSE > 1 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorLoadShuffle src));
  format %{ "movdqu $dst, $src\t! load shuffle (load 16B for 16BRearrange)" %}
  ins_encode %{
    __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadshuffle32b(vecY dst, vecY src) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorLoadShuffle src));
  format %{ "vmovdqu $dst, $src\t! load shuffle (load 32B for 32BRearrange)" %}
  ins_encode %{
    __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadshuffle64b(vecZ dst, vecZ src) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 64 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorLoadShuffle src));
  format %{ "vmovdqu $dst, $src\t! load shuffle (load 64B for 64BRearrange)" %}
  ins_encode %{
    __ evmovdqul($dst$$XMMRegister, $src$$XMMRegister, 2);
  %}
  ins_pipe( pipe_slow );
%}

// Short lanes, pshufb form: each lane index is widened to a word, multiplied by
// the vector_short_sizemask constant, copied into the high byte of its word
// (shift left 8 + byte add), and offset by the vector_short_shufflemask
// constant. Per the format string the constants are 0x0002.. and 0x0100.., so
// lane i yields the byte indices (2i, 2i+1).
instruct loadshuffle4s(vecD dst, vecS src, vecD tmp, vecD tmp2, rRegI scratch) %{
  predicate(UseSSE > 3 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorLoadShuffle src));
  effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP scratch);
  format %{ "pmovsxbw $tmp, $src \n\t"
            "movdqu $tmp2,0x0002000200020002\n\t"
            "pmullw $tmp,$tmp2\n\t"
            "movdqu $tmp2,$tmp\n\t"
            "psllw $tmp2,0x8\n\t"
            "paddb $tmp2,$tmp\n\t"
            "movdqu $tmp, 0x0100010001000100 \n\t"
            "paddb $tmp2,$tmp\n\t"
            "movdqu $dst, $tmp2\t! load shuffle (load 4B for 4SRearrange)" %}
  ins_encode %{
    __ pmovsxbw($tmp$$XMMRegister, $src$$XMMRegister);
    __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_sizemask()), $scratch$$Register);
    __ pmullw($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ movdqu($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ psllw($tmp2$$XMMRegister, 0x8);
    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register);
    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdqu($dst$$XMMRegister, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Same sequence as loadshuffle4s, for 8 short lanes.
instruct loadshuffle8s(vecX dst, vecD src, vecX tmp, vecX tmp2, rRegI scratch) %{
  predicate(UseSSE > 3 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorLoadShuffle src));
  effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP scratch);
  format %{ "pmovsxbw $tmp, $src \n\t"
            "movdqu $tmp2,0x0002000200020002\n\t"
            "pmullw $tmp,$tmp2\n\t"
            "movdqu $tmp2,$tmp\n\t"
            "psllw $tmp2,0x8\n\t"
            "paddb $tmp2,$tmp\n\t"
            "movdqu $tmp, 0x0100010001000100 \n\t"
            "paddb $tmp2,$tmp\n\t"
            "movdqu $dst, $tmp2\t! load shuffle (load 8B for 8SRearrange)" %}
  ins_encode %{
    __ pmovsxbw($tmp$$XMMRegister, $src$$XMMRegister);
    __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_sizemask()), $scratch$$Register);
    __ pmullw($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ movdqu($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ psllw($tmp2$$XMMRegister, 0x8);
    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register);
    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdqu($dst$$XMMRegister, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// 16/32 short lanes: indices are only widened from bytes to words (consumed by
// the vpermw-based rearrange rules).
instruct loadshuffle16s(vecY dst, vecX src) %{
  predicate(UseAVX >= 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorLoadShuffle src));
  format %{ "vpmovsxbw $dst,$src\t! load shuffle (load 16B for 16SRearrange)" %}
  ins_encode %{
    int vector_len = 1;
    __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadshuffle32s(vecZ dst, vecY src) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorLoadShuffle src));
  format %{ "vpmovsxbw $dst,$src\t! load shuffle (load 32B for 32SRearrange)" %}
  ins_encode %{
    int vector_len = 2;
    __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Int/float lanes, pshufb form: each lane index is widened to a dword,
// multiplied by the vector_int_sizemask constant, replicated into all four bytes
// of the dword (three shift-left-8 + byte-add rounds), and offset by the
// vector_int_shufflemask constant. Per the format string the constants are
// 0x00000004.. and 0x03020100.., so lane i yields bytes (4i .. 4i+3).
// NOTE(review): vpmovsxbd is called with a vector-length argument while the
// predicate only requires UseSSE > 3 - confirm this is valid on targets
// without AVX.
instruct loadshuffle4i(vecX dst, vecS src, vecX tmp, vecX tmp2, rRegI scratch) %{
  predicate(UseSSE > 3 && n->as_Vector()->length() == 4 &&
            (n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
             n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT));
  match(Set dst (VectorLoadShuffle src));
  effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP scratch);
  format %{ "vpmovsxbd $tmp, $src \n\t"
            "movdqu $tmp2, 0x0000000400000004 \n\t"
            "pmulld $tmp2, $tmp \n\t"
            "movdqu $tmp,$tmp2\n\t"
            "pslld $tmp2,0x8\n\t"
            "paddb $tmp2,$tmp\n\t"
            "pslld $tmp2,0x8\n\t"
            "paddb $tmp2,$tmp\n\t"
            "pslld $tmp2,0x8\n\t"
            "paddb $tmp2,$tmp\n\t"
            "movdqu $tmp, 0x0302010003020100 \n\t"
            "paddb $tmp2,$tmp\n\t"
            "movdqu $dst, $tmp2\t! load shuffle (load 4B for 4IRearrange)" %}
  ins_encode %{
    __ vpmovsxbd($tmp$$XMMRegister, $src$$XMMRegister, 0);
    __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_int_sizemask()), $scratch$$Register);
    __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdqu($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ pslld($tmp2$$XMMRegister, 0x8);
    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ pslld($tmp2$$XMMRegister, 0x8);
    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ pslld($tmp2$$XMMRegister, 0x8);
    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_int_shufflemask()), $scratch$$Register);
    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdqu($dst$$XMMRegister, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// 8/16 int/float lanes: indices are only widened from bytes to dwords
// (consumed by the vpermd-based rearrange rules).
// NOTE(review): the predicate accepts UseAVX >= 1, while the supported-check
// visible above rejects size_in_bits >= 256 when UseAVX < 2 - confirm which
// bound is intended.
instruct loadshuffle8i(vecY dst, vecD src) %{
  predicate(UseAVX >= 1 && n->as_Vector()->length() == 8 &&
            (n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
             n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT));
  match(Set dst (VectorLoadShuffle src));
  format %{ "vpmovsxbd $dst, $src\t! load shuffle (load 8B for 8IRearrange)" %}
  ins_encode %{
    int vector_len = 1;
    __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadshuffle16i(vecZ dst, vecX src) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
            (n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
             n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT));
  match(Set dst (VectorLoadShuffle src));
  format %{ "vpmovsxbd $dst, $src\t! load shuffle (load 16B for 16IRearrange)" %}
  ins_encode %{
    int vector_len = 2;
    __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Long/double lanes: each lane index is widened to a dword, multiplied by the
// vector_long_sizemask constant, widened to a qword, copied into the upper
// dword (shift left 32 + dword add), and offset by the vector_long_shufflemask
// constant. Per the format string the constants are 0x00000002.. and
// 0x0000000100000000, so lane i yields the dword indices (2i, 2i+1) that the
// vpermd-based long rearrange rules consume.
instruct loadshuffle4l(vecY dst, vecS src, vecY tmp, vecY tmp2, rRegI scratch) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4 &&
            (n->bottom_type()->is_vect()->element_basic_type() == T_LONG ||
             n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE));
  match(Set dst (VectorLoadShuffle src));
  effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP scratch);
  format %{ "vpmovsxbd $tmp2, $src \n\t"
            "movdqu $tmp, 0x0000000200000002 \n\t"
            "pmulld $tmp, $tmp2 \n\t"
            "vpmovsxdq $tmp2,$tmp\n\t"
            "vpsllq $tmp2,0x20\n\t"
            "vpaddd $tmp2,$tmp\n\t"
            "vmovdqu $tmp, 0x0000000100000000 \n\t"
            "vpaddd $tmp2,$tmp\n\t"
            "vmovdqu $dst, $tmp2\t! load shuffle (load 4L for 4LRearrange)" %}
  ins_encode %{
    int vector_len = 1;
    __ vpmovsxbd($tmp2$$XMMRegister, $src$$XMMRegister, 0);
    __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sizemask()), $scratch$$Register);
    __ pmulld($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ vpmovsxdq($tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vpsllq($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x20, vector_len);
    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_shufflemask()), $scratch$$Register);
    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vmovdqu($dst$$XMMRegister, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Same scheme as loadshuffle4l for 8 long/double lanes; the first widening and
// the multiply run at vector length 1, the remaining steps at length 2.
instruct loadshuffle8l(vecZ dst, vecD src, vecZ tmp, vecZ tmp2, rRegI scratch) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
            (n->bottom_type()->is_vect()->element_basic_type() == T_LONG ||
             n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE));
  match(Set dst (VectorLoadShuffle src));
  effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP scratch);
  format %{ "vpmovsxbd $tmp2, $src \n\t"
            "movdqu $tmp, 0x0000000200000002 \n\t"
            "pmulld $tmp, $tmp2\n\t"
            "vpmovsxdq $tmp2,$tmp\n\t"
            "vpsllq $tmp2,0x20\n\t"
            "vpaddd $tmp2,$tmp\n\t"
            "vmovdqu $tmp, 0x0000000100000000 \n\t"
            "vpaddd $tmp2,$tmp\n\t"
            "vmovdqu $dst, $tmp2\t! load shuffle (load 8L for 8LRearrange)" %}
  ins_encode %{
    int vector_len = 2;
    __ vpmovsxbd($tmp2$$XMMRegister, $src$$XMMRegister, 1);
    __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sizemask()), $scratch$$Register);
    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 1);
    __ vpmovsxdq($tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ vpsllq($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x20, vector_len);
    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ evmovdqul($tmp$$XMMRegister, k1, ExternalAddress(vector_long_shufflemask()), false, vector_len, $scratch$$Register);
    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
    __ evmovdqul($dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

//-------------------------------- Rearrange -------------------------------------
// The VectorRearrange rules permute the data vector with the index vector
// produced by the matching loadshuffle rule. The pshufb forms work in place on
// $dst; the vperm* forms read $src and write $dst.

instruct rearrange8b(vecD dst, vecD shuffle) %{
  predicate(UseSSE > 2 && n->as_Vector()->length() == 8 &&
            n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorRearrange dst shuffle));
  effect(TEMP dst);
  format %{ "pshufb $dst, $shuffle\t! rearrange (8BRearrange)" %}
  ins_encode %{
    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrange16b(vecX dst, vecX shuffle) %{
  predicate(UseSSE > 2 && n->as_Vector()->length() == 16 &&
            n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorRearrange dst shuffle));
  effect(TEMP dst);
  format %{ "pshufb $dst, $shuffle\t! rearrange (16BRearrange)" %}
  ins_encode %{
    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrange32b(vecY dst, vecY src, vecY shuffle) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512vbmi() && n->as_Vector()->length() == 32 &&
            n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorRearrange src shuffle));
  effect(TEMP dst);
  format %{ "vpermb $dst, $shuffle\t! rearrange (32BRearrange)" %}
  ins_encode %{
    int vector_len = 1;
    __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrange64b(vecZ dst, vecZ src, vecZ shuffle) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512vbmi() && n->as_Vector()->length() == 64 &&
            n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorRearrange src shuffle));
  effect(TEMP dst);
  format %{ "vpermb $dst, $shuffle\t! rearrange (64BRearrange)" %}
  ins_encode %{
    int vector_len = 2;
    __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrange4s(vecD dst, vecD shuffle) %{
  predicate(UseSSE > 2 && n->as_Vector()->length() == 4 &&
            n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorRearrange dst shuffle));
  effect(TEMP dst);
  format %{ "pshufb $dst, $shuffle\t! rearrange (4SRearrange)" %}
  ins_encode %{
    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrange8s(vecX dst, vecX shuffle) %{
  predicate(UseSSE > 2 && n->as_Vector()->length() == 8 &&
            n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorRearrange dst shuffle));
  effect(TEMP dst);
  format %{ "pshufb $dst, $shuffle\t! rearrange (8SRearrange)" %}
  ins_encode %{
    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrange16s(vecY dst, vecY src, vecY shuffle) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512vlbw() && n->as_Vector()->length() == 16 &&
            n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorRearrange src shuffle));
  effect(TEMP dst);
  format %{ "vpermw $dst, $shuffle\t! rearrange (16SRearrange)" %}
  ins_encode %{
    int vector_len = 1;
    __ vpermw($dst$$XMMRegister, k0, $shuffle$$XMMRegister, $src$$XMMRegister, false, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrange32s(vecZ dst, vecZ src, vecZ shuffle) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512vlbw() && n->as_Vector()->length() == 32 &&
            n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
  match(Set dst (VectorRearrange src shuffle));
  effect(TEMP dst);
  format %{ "vpermw $dst, $shuffle\t! rearrange (32SRearrange)" %}
  ins_encode %{
    int vector_len = 2;
    // Exactly one "__" emitter prefix, matching rearrange16s.
    __ vpermw($dst$$XMMRegister, k0, $shuffle$$XMMRegister, $src$$XMMRegister, false, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrange4i(vecX dst, vecX shuffle) %{
  predicate(UseSSE > 2 && n->as_Vector()->length() == 4 &&
            (n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
             n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT));
  match(Set dst (VectorRearrange dst shuffle));
  effect(TEMP dst);
  format %{ "pshufb $dst, $shuffle\t! rearrange (4IRearrange)" %}
  ins_encode %{
    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrange8i(vecY dst, vecY src, vecY shuffle) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
            (n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
             n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT));
  match(Set dst (VectorRearrange src shuffle));
  effect(TEMP dst);
  format %{ "vpermd $dst, $src, $shuffle\t! rearrange (8IRearrange)" %}
  ins_encode %{
    // The three-operand vpermd call takes no vector-length argument.
    __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrange16i(vecZ dst, vecZ src, vecZ shuffle) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
            (n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
             n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT));
  match(Set dst (VectorRearrange src shuffle));
  effect(TEMP dst);
  format %{ "vpermd $dst, $src, $shuffle\t! rearrange (16IRearrange)" %}
  ins_encode %{
    int vector_len = 2;
    __ evpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// Long/double lanes are permuted as pairs of dwords, using the (2i, 2i+1)
// indices built by loadshuffle4l / loadshuffle8l.
instruct rearrange4l(vecY dst, vecY src, vecY shuffle) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 4 &&
            (n->bottom_type()->is_vect()->element_basic_type() == T_LONG ||
             n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE));
  match(Set dst (VectorRearrange src shuffle));
  effect(TEMP dst);
  format %{ "vpermd $dst, $src, $shuffle\t! rearrange (4LRearrange)" %}
  ins_encode %{
    // The three-operand vpermd call takes no vector-length argument.
    __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rearrange8l(vecZ dst, vecZ src, vecZ shuffle) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
            (n->bottom_type()->is_vect()->element_basic_type() == T_LONG ||
             n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE));
  match(Set dst (VectorRearrange src shuffle));
  effect(TEMP dst);
  format %{ "vpermd $dst, $src, $shuffle\t! rearrange (8LRearrange)" %}
  ins_encode %{
    int vector_len = 2;
    __ evpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// --------------------------------- FMA --------------------------------------

// a * b + c
instruct vfma2D_reg(vecX a, vecX b, vecX c) %{
  predicate(UseFMA && n->as_Vector()->length() == 2);
  match(Set c (FmaVD c (Binary a b)));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
  ins_cost(150);
  ins_encode %{
    int vector_len = 0;
    __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

// a * b + c
// (this rule is cut off by the end of the excerpt)
instruct vfma2D_mem(vecX a, memory b, vecX c) %{
  predicate(UseFMA && n->as_Vector()->length() == 2);
  match(Set c (FmaVD c (Binary a (LoadVector b))));
  format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
|