< prev index next >

src/hotspot/cpu/x86/x86.ad

Print this page

        

@@ -1347,10 +1347,16 @@
   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
   static address vector_all_ones_mask() { return StubRoutines::x86::vector_all_ones_mask(); }
+  static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }     // per-int base byte pattern 0x03020100 (see loadshuffle4i)
+  static address vector_int_sizemask() { return StubRoutines::x86::vector_int_size_mask(); }           // per-int multiplier 0x00000004 to scale lane index to byte offset
+  static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); } // per-short base byte pattern 0x0100 (see loadshuffle4s/8s)
+  static address vector_short_sizemask() { return StubRoutines::x86::vector_short_size_mask(); }       // per-short multiplier 0x0002 to scale lane index to byte offset
+  static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }   // per-long dword pattern 0x0000000100000000 (see loadshuffle4l/8l)
+  static address vector_long_sizemask() { return StubRoutines::x86::vector_long_size_mask(); }         // per-long multiplier 2 (dwords per long) for index scaling
 #else
   static address float_signmask()  { return (address)float_signmask_pool; }
   static address float_signflip()  { return (address)float_signflip_pool; }
   static address double_signmask() { return (address)double_signmask_pool; }
   static address double_signflip() { return (address)double_signflip_pool; }

@@ -1526,10 +1532,17 @@
         case Op_VectorLoadMask:
           if (UseSSE <= 3) { ret_value = false; }
           else if (vlen == 1 || vlen == 2) { ret_value = false; } // Implementation limitation
           else if (size_in_bits >= 256 && UseAVX < 2) { ret_value = false; } // Implementation limitation
           break;
+        case Op_VectorLoadShuffle:
+        case Op_VectorRearrange:
+          if (vlen == 1 || vlen == 2) { ret_value = false; } // Implementation limitation due to how shuffle is loaded
+          else if (size_in_bits >= 256 && UseAVX < 2) { ret_value = false; } // Implementation limitation
+          else if (bt == T_BYTE && size_in_bits >= 256 && !VM_Version::supports_avx512vbmi())  { ret_value = false; } // Implementation limitation
+          else if (bt == T_SHORT && size_in_bits >= 256 && !VM_Version::supports_avx512vlbw())  { ret_value = false; } // Implementation limitation
+          break;
         case Op_VectorStoreMask:
           if (UseAVX < 0) { ret_value = false; } // Implementation limitation
           else if ((size_in_bits >= 256 || bt == T_LONG || bt == T_DOUBLE) && UseAVX < 2) { ret_value = false; } // Implementation limitation
           else if (vlen == 1 || vlen == 2) { ret_value = false; } // Implementation limitation
           else if (size_in_bits == 512 && !VM_Version::supports_avx512bw()) { ret_value = false; } // Implementation limitation

@@ -23533,10 +23546,420 @@
     __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_byte_bitset()), false, 0, $scratch$$Register);
   %}
   ins_pipe( pipe_slow );
 %}
 
+//-------------------------------- LOAD_SHUFFLE ----------------------------------
+
+instruct loadshuffle8b(vecD dst, vecD src) %{ // byte shuffle indices are already byte-granular: a plain register copy suffices
+  predicate(UseSSE > 1  && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
+  match(Set dst (VectorLoadShuffle src));
+  format %{ "movdqu $dst, $src\t! load shuffle (load 8B for 8BRearrange)" %}
+  ins_encode %{
+     __ movdqu($dst$$XMMRegister, $src$$XMMRegister); // copy 8 byte indices for the pshufb-based rearrange8b
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct loadshuffle16b(vecX dst, vecX src) %{ // same as loadshuffle8b, for a full 128-bit byte vector
+  predicate(UseSSE > 1 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
+  match(Set dst (VectorLoadShuffle src));
+  format %{ "movdqu $dst, $src\t! load shuffle (load 16B for 16BRearrange)" %}
+  ins_encode %{
+    __ movdqu($dst$$XMMRegister, $src$$XMMRegister); // copy 16 byte indices for rearrange16b
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct loadshuffle32b(vecY dst, vecY src) %{ // 256-bit variant; consumed by the vpermb-based rearrange32b
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
+  match(Set dst (VectorLoadShuffle src));
+  format %{ "vmovdqu $dst, $src\t! load shuffle (load 32B for 32BRearrange)" %}
+  ins_encode %{
+    __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); // copy 32 byte indices
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct loadshuffle64b(vecZ dst, vecZ src) %{ // 512-bit variant; consumed by the vpermb-based rearrange64b
+  predicate(UseAVX > 2  && n->as_Vector()->length() == 64 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
+  match(Set dst (VectorLoadShuffle src));
+  format %{ "evmovdqul $dst, $src\t! load shuffle (load 64B for 64BRearrange)" %}
+  ins_encode %{
+    __ evmovdqul($dst$$XMMRegister, $src$$XMMRegister, 2); // vector_len = 2 (512-bit); EVEX move matches the format line
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct loadshuffle4s(vecD dst, vecS src, vecD tmp, vecD tmp2, rRegI scratch) %{ // expand 4 byte indices into a byte-level pshufb mask: short lane i -> bytes (2i+1, 2i)
+  predicate(UseSSE > 3 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
+  match(Set dst (VectorLoadShuffle src));
+  effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP scratch);
+   format %{ "pmovsxbw    $tmp, $src \n\t"
+             "movdqu      $tmp2,0x0002000200020002\n\t"
+             "pmullw      $tmp,$tmp2\n\t"
+             "movdqu      $tmp2,$tmp\n\t"
+             "psllw       $tmp2,0x8\n\t"
+             "paddb       $tmp2,$tmp\n\t"
+             "movdqu      $tmp, 0x0100010001000100 \n\t"
+             "paddb       $tmp2,$tmp\n\t"
+             "movdqu      $dst, $tmp2\t! load shuffle (load 4B for 4SRearrange)" %}
+  ins_encode %{
+    __ pmovsxbw($tmp$$XMMRegister, $src$$XMMRegister);                                               // widen byte indices to shorts
+    __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_sizemask()), $scratch$$Register);     // 0x0002 per lane
+    __ pmullw($tmp$$XMMRegister, $tmp2$$XMMRegister);                                                // tmp = 2*i (byte offset of short lane i)
+    __ movdqu($tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ psllw($tmp2$$XMMRegister, 0x8);                                                               // replicate 2*i into the high byte of each short
+    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);                                                 // each short now (2i << 8) | 2i
+    __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register);   // 0x0100 per lane
+    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);                                                 // bytes become (2i+1, 2i)
+    __ movdqu($dst$$XMMRegister, $tmp2$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct loadshuffle8s(vecX dst, vecD src, vecX tmp, vecX tmp2, rRegI scratch) %{ // 8-lane version of loadshuffle4s (same byte-mask expansion)
+  predicate(UseSSE > 3 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
+  match(Set dst (VectorLoadShuffle src));
+  effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP scratch);
+  format %{  "pmovsxbw    $tmp, $src \n\t"
+             "movdqu      $tmp2,0x0002000200020002\n\t"
+             "pmullw      $tmp,$tmp2\n\t"
+             "movdqu      $tmp2,$tmp\n\t"
+             "psllw       $tmp2,0x8\n\t"
+             "paddb       $tmp2,$tmp\n\t"
+             "movdqu      $tmp, 0x0100010001000100 \n\t"
+             "paddb       $tmp2,$tmp\n\t"
+             "movdqu      $dst, $tmp2\t! load shuffle (load 8B for 8SRearrange)" %}
+  ins_encode %{
+    __ pmovsxbw($tmp$$XMMRegister, $src$$XMMRegister);                                               // widen byte indices to shorts
+    __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_short_sizemask()), $scratch$$Register);     // 0x0002 per lane
+    __ pmullw($tmp$$XMMRegister, $tmp2$$XMMRegister);                                                // tmp = 2*i
+    __ movdqu($tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ psllw($tmp2$$XMMRegister, 0x8);
+    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);                                                 // each short now (2i << 8) | 2i
+    __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register);   // 0x0100 per lane
+    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);                                                 // bytes become (2i+1, 2i)
+    __ movdqu($dst$$XMMRegister, $tmp2$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct loadshuffle16s(vecY dst, vecX src) %{ // AVX2+: widen byte indices to words directly; vpermw in rearrange16s takes word indices
+  predicate(UseAVX >= 2 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
+  match(Set dst (VectorLoadShuffle src));
+  format %{ "vpmovsxbw   $dst,$src\t! load shuffle (load 16B for 16SRearrange)" %}
+  ins_encode %{
+    int vector_len = 1; // 256-bit
+    __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct loadshuffle32s(vecZ dst, vecY src) %{ // 512-bit version of loadshuffle16s
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
+  match(Set dst (VectorLoadShuffle src));
+  format %{ "vpmovsxbw   $dst,$src\t! load shuffle (load 32B for 32SRearrange)" %}
+  ins_encode %{
+    int vector_len = 2; // 512-bit
+    __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct loadshuffle4i(vecX dst, vecS src, vecX tmp, vecX tmp2, rRegI scratch) %{ // expand 4 byte indices into a pshufb byte mask: int lane i -> bytes (4i+3..4i)
+  predicate(UseSSE > 3 && n->as_Vector()->length() == 4 &&
+            (n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
+             n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT));
+  match(Set dst (VectorLoadShuffle src));
+  effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP scratch);
+  format %{ "vpmovsxbd   $tmp, $src \n\t"
+            "movdqu      $tmp2, 0x0000000400000004 \n\t"
+            "pmulld      $tmp2, $tmp \n\t"
+            "movdqu      $tmp,$tmp2\n\t"
+            "pslld       $tmp2,0x8\n\t"
+            "paddb       $tmp2,$tmp\n\t"
+            "pslld       $tmp2,0x8\n\t"
+            "paddb       $tmp2,$tmp\n\t"
+            "pslld       $tmp2,0x8\n\t"
+            "paddb       $tmp2,$tmp\n\t"
+            "movdqu      $tmp, 0x0302010003020100 \n\t"
+            "paddb       $tmp2,$tmp\n\t"
+            "movdqu      $dst, $tmp2\t! load shuffle (load 4B for 4IRearrange)" %}
+  ins_encode %{
+    __ vpmovsxbd($tmp$$XMMRegister, $src$$XMMRegister, 0);                                         // widen byte indices to ints (128-bit)
+    __ movdqu($tmp2$$XMMRegister, ExternalAddress(vector_int_sizemask()), $scratch$$Register);     // 0x00000004 per lane
+    __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);                                              // tmp2 = 4*i (byte offset of int lane i)
+    __ movdqu($tmp$$XMMRegister, $tmp2$$XMMRegister);
+    __ pslld($tmp2$$XMMRegister, 0x8);                                                             // three shift+add rounds replicate 4*i
+    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);                                               //   into all four bytes of each int
+    __ pslld($tmp2$$XMMRegister, 0x8);
+    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pslld($tmp2$$XMMRegister, 0x8);
+    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_int_shufflemask()), $scratch$$Register);   // 0x03020100 per lane
+    __ paddb($tmp2$$XMMRegister, $tmp$$XMMRegister);                                               // bytes become (4i+3, 4i+2, 4i+1, 4i)
+    __ movdqu($dst$$XMMRegister, $tmp2$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct loadshuffle8i(vecY dst, vecD src) %{ // widen byte indices to ints; vpermd in rearrange8i takes dword indices
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
+            (n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
+             n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT));
+  match(Set dst (VectorLoadShuffle src));
+  format %{ "vpmovsxbd $dst, $src\t! load shuffle (load 8B for 8IRearrange)" %}
+  ins_encode %{
+    int vector_len = 1; // 256-bit; vpmovsxbd to ymm requires AVX2, hence UseAVX > 1 above
+    __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct loadshuffle16i(vecZ dst, vecX src) %{ // 512-bit version of loadshuffle8i, feeding evpermd in rearrange16i
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
+            (n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
+             n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT));
+  match(Set dst (VectorLoadShuffle src));
+  format %{ "vpmovsxbd $dst, $src\t! load shuffle (load 16B for 16IRearrange)" %}
+  ins_encode %{
+    int vector_len = 2; // 512-bit
+    __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct loadshuffle4l(vecY dst, vecS src, vecY tmp, vecY tmp2, rRegI scratch) %{ // expand 4 byte indices into dword pairs: long lane i -> dwords (2i, 2i+1) for vpermd
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 4 &&
+            (n->bottom_type()->is_vect()->element_basic_type() == T_LONG ||
+             n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE));
+  match(Set dst (VectorLoadShuffle src));
+  effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP scratch);
+  format %{ "vpmovsxbd   $tmp2, $src \n\t"
+             "movdqu     $tmp, 0x0000000200000002 \n\t"
+             "pmulld     $tmp, $tmp2 \n\t"
+             "vpmovsxdq  $tmp2,$tmp\n\t"
+             "vpsllq     $tmp,$tmp2,0x20\n\t"
+             "vpaddd     $tmp2,$tmp\n\t"
+             "vmovdqu    $tmp, 0x0000000100000000 \n\t"
+             "vpaddd     $tmp2,$tmp\n\t"
+             "vmovdqu    $dst, $tmp2\t! load shuffle (load 4L for 4LRearrange)" %}
+  ins_encode %{
+    int vector_len = 1; // 256-bit
+    __ vpmovsxbd($tmp2$$XMMRegister, $src$$XMMRegister, 0);                                          // widen byte indices to ints (128-bit)
+    __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sizemask()), $scratch$$Register);       // 2 dwords per long
+    __ pmulld($tmp$$XMMRegister, $tmp2$$XMMRegister);                                                // tmp = 2*i
+    __ vpmovsxdq($tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);                                 // widen 2*i to longs
+    __ vpsllq($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x20, vector_len);                              // replicate 2*i into the high dword
+    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);                // each long now dwords (2i, 2i)
+    __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_shufflemask()), $scratch$$Register);   // 0x0000000100000000 per lane
+    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);                // dwords become (2i, 2i+1)
+    __ vmovdqu($dst$$XMMRegister, $tmp2$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct loadshuffle8l(vecZ dst, vecD src, vecZ tmp, vecZ tmp2, rRegI scratch) %{ // 512-bit version of loadshuffle4l, feeding evpermd in rearrange8l
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
+            (n->bottom_type()->is_vect()->element_basic_type() == T_LONG ||
+             n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE));
+  match(Set dst (VectorLoadShuffle src));
+  effect(TEMP dst, TEMP tmp, TEMP tmp2, TEMP scratch);
+  format %{ "vpmovsxbd  $tmp2, $src \n\t"
+            "vmovdqu    $tmp, 0x0000000200000002 \n\t"
+            "vpmulld    $tmp, $tmp2\n\t"
+            "vpmovsxdq  $tmp2,$tmp\n\t"
+            "vpsllq     $tmp,$tmp2,0x20\n\t"
+            "vpaddd     $tmp2,$tmp\n\t"
+            "vmovdqu    $tmp, 0x0000000100000000 \n\t"
+            "vpaddd     $tmp2,$tmp\n\t"
+            "vmovdqu    $dst, $tmp2\t! load shuffle (load 8L for 8LRearrange)" %}
+  ins_encode %{
+    int vector_len = 2; // 512-bit
+    __ vpmovsxbd($tmp2$$XMMRegister, $src$$XMMRegister, 1);                                          // widen 8 byte indices to ints (256-bit)
+    __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sizemask()), $scratch$$Register);      // 2 dwords per long
+    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 1);                         // tmp = 2*i
+    __ vpmovsxdq($tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);                                 // widen 2*i to longs
+    __ vpsllq($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x20, vector_len);                              // replicate 2*i into the high dword
+    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);                // each long now dwords (2i, 2i)
+    __ evmovdqul($tmp$$XMMRegister, k1, ExternalAddress(vector_long_shufflemask()), false, vector_len, $scratch$$Register); // 0x0000000100000000 per lane
+    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);                // dwords become (2i, 2i+1)
+    __ evmovdqul($dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+//-------------------------------- Rearrange -------------------------------------
+
+instruct rearrange8b(vecD dst, vecD shuffle) %{ // in-place byte permute; shuffle holds byte indices from loadshuffle8b
+  predicate(UseSSE > 2 && n->as_Vector()->length() == 8 &&
+            n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
+  match(Set dst (VectorRearrange dst shuffle));
+  effect(TEMP dst);
+  format %{ "pshufb $dst, $shuffle\t! rearrange (8BRearrange)" %}
+  ins_encode %{
+    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rearrange16b(vecX dst, vecX shuffle) %{ // in-place 128-bit byte permute; NOTE(review): pshufb is SSSE3 — confirm UseSSE > 2 implies SSSE3 here
+  predicate(UseSSE > 2 && n->as_Vector()->length() == 16 &&
+            n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
+  match(Set dst (VectorRearrange dst shuffle));
+  effect(TEMP dst);
+  format %{ "pshufb $dst, $shuffle\t! rearrange (16BRearrange)" %}
+  ins_encode %{
+    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rearrange32b(vecY dst, vecY src, vecY shuffle) %{ // cross-lane byte permute via AVX512_VBMI vpermb (indices in the second operand)
+  predicate(UseAVX > 2 && VM_Version::supports_avx512vbmi() && n->as_Vector()->length() == 32 &&
+            n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
+  match(Set dst (VectorRearrange src shuffle));
+  effect(TEMP dst);
+  format %{ "vpermb $dst, $shuffle\t! rearrange (32BRearrange)" %}
+  ins_encode %{
+    int vector_len = 1; // 256-bit
+    __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rearrange64b(vecZ dst, vecZ src, vecZ shuffle) %{ // 512-bit version of rearrange32b
+  predicate(UseAVX > 2 && VM_Version::supports_avx512vbmi() && n->as_Vector()->length() == 64 &&
+            n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
+  match(Set dst (VectorRearrange src shuffle));
+  effect(TEMP dst);
+  format %{ "vpermb $dst, $shuffle\t! rearrange (64BRearrange)" %}
+  ins_encode %{
+    int vector_len = 2; // 512-bit
+    __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rearrange4s(vecD dst, vecD shuffle) %{ // in-place short permute; shuffle holds the byte-level mask built by loadshuffle4s
+  predicate(UseSSE > 2 && n->as_Vector()->length() == 4 &&
+            n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
+  match(Set dst (VectorRearrange dst shuffle));
+  effect(TEMP dst);
+  format %{ "pshufb $dst, $shuffle\t! rearrange (4SRearrange)" %}
+  ins_encode %{
+    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rearrange8s(vecX dst, vecX shuffle) %{ // in-place short permute; shuffle holds the byte-level mask built by loadshuffle8s
+  predicate(UseSSE > 2 && n->as_Vector()->length() == 8 &&
+            n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
+  match(Set dst (VectorRearrange dst shuffle));
+  effect(TEMP dst);
+  format %{ "pshufb $dst, $shuffle\t! rearrange (8SRearrange)" %}
+  ins_encode %{
+    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rearrange16s(vecY dst, vecY src, vecY shuffle) %{ // cross-lane word permute via AVX512 vpermw; k0 = no masking, merge = false
+  predicate(UseAVX > 2 && VM_Version::supports_avx512vlbw() && n->as_Vector()->length() == 16 &&
+            n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
+  match(Set dst (VectorRearrange src shuffle));
+  effect(TEMP dst);
+  format %{ "vpermw $dst, $shuffle\t! rearrange (16SRearrange)" %}
+  ins_encode %{
+    int vector_len = 1; // 256-bit
+    __ vpermw($dst$$XMMRegister, k0, $shuffle$$XMMRegister, $src$$XMMRegister, false, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rearrange32s(vecZ dst, vecZ src, vecZ shuffle) %{ // 512-bit version of rearrange16s
+  predicate(UseAVX > 2 && VM_Version::supports_avx512vlbw() && n->as_Vector()->length() == 32 &&
+            n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
+  match(Set dst (VectorRearrange src shuffle));
+  effect(TEMP dst);
+  format %{ "vpermw $dst, $shuffle\t! rearrange (32SRearrange)" %}
+  ins_encode %{
+    int vector_len = 2; // 512-bit
+    __ vpermw($dst$$XMMRegister, k0, $shuffle$$XMMRegister, $src$$XMMRegister, false, vector_len); // was "__ __ vpermw(...)": doubled macro would not compile
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rearrange4i(vecX dst, vecX shuffle) %{ // in-place int permute; shuffle holds the byte-level mask built by loadshuffle4i
+  predicate(UseSSE > 2 && n->as_Vector()->length() == 4 &&
+            (n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
+             n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT));
+  match(Set dst (VectorRearrange dst shuffle));
+  effect(TEMP dst);
+  format %{ "pshufb $dst, $shuffle\t! rearrange (4IRearrange)" %}
+  ins_encode %{
+    __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rearrange8i(vecY dst, vecY src, vecY shuffle) %{ // cross-lane dword permute; shuffle holds dword indices from loadshuffle8i
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8 &&
+            (n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
+             n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT));
+  match(Set dst (VectorRearrange src shuffle));
+  effect(TEMP dst);
+  format %{ "vpermd $dst, $shuffle, $src\t! rearrange (8IRearrange)" %}
+  ins_encode %{
+    // 3-operand vpermd is the VEX.256 (AVX2) form; no vector_len argument is needed
+    __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rearrange16i(vecZ dst, vecZ src, vecZ shuffle) %{ // 512-bit cross-lane dword permute via EVEX vpermd
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16 &&
+            (n->bottom_type()->is_vect()->element_basic_type() == T_INT ||
+             n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT));
+  match(Set dst (VectorRearrange src shuffle));
+  effect(TEMP dst);
+  format %{ "vpermd $dst, $shuffle, $src\t! rearrange (16IRearrange)" %}
+  ins_encode %{
+    int vector_len = 2; // 512-bit
+    __ evpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rearrange4l(vecY dst, vecY src, vecY shuffle) %{ // long permute via vpermd on dword pairs; loadshuffle4l pre-expanded indices to (2i, 2i+1)
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 4 &&
+            (n->bottom_type()->is_vect()->element_basic_type() == T_LONG ||
+             n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE));
+  match(Set dst (VectorRearrange src shuffle));
+  effect(TEMP dst);
+  format %{ "vpermd $dst, $shuffle, $src\t! rearrange (4LRearrange)" %}
+  ins_encode %{
+    // 3-operand vpermd is the VEX.256 (AVX2) form; no vector_len argument is needed
+    __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rearrange8l(vecZ dst, vecZ src, vecZ shuffle) %{ // 512-bit long permute via EVEX vpermd on dword pairs from loadshuffle8l
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 8 &&
+            (n->bottom_type()->is_vect()->element_basic_type() == T_LONG ||
+             n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE));
+  match(Set dst (VectorRearrange src shuffle));
+  effect(TEMP dst);
+  format %{ "vpermd $dst, $shuffle, $src\t! rearrange (8LRearrange)" %}
+  ins_encode %{
+    int vector_len = 2; // 512-bit
+    __ evpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
 // --------------------------------- FMA --------------------------------------
 
 // a * b + c
 instruct vfma2D_reg(vecX a, vecX b, vecX c) %{
   predicate(UseFMA && n->as_Vector()->length() == 2);
< prev index next >