1 //
   2 // Copyright (c) 2011, 2017, Oracle and/or its affiliates. All rights reserved.
   3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4 //
   5 // This code is free software; you can redistribute it and/or modify it
   6 // under the terms of the GNU General Public License version 2 only, as
   7 // published by the Free Software Foundation.
   8 //
   9 // This code is distributed in the hope that it will be useful, but WITHOUT
  10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 // version 2 for more details (a copy is included in the LICENSE file that
  13 // accompanied this code).
  14 //
  15 // You should have received a copy of the GNU General Public License version
  16 // 2 along with this work; if not, write to the Free Software Foundation,
  17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20 // or visit www.oracle.com if you need additional information or have any
  21 // questions.
  22 //
  23 //
  24 
  25 // X86 Common Architecture Description File
  26 
  27 //----------REGISTER DEFINITION BLOCK------------------------------------------
  28 // This information is used by the matcher and the register allocator to
  29 // describe individual registers and classes of registers within the target
  30 // archtecture.
  31 
  32 register %{
  33 //----------Architecture Description Register Definitions----------------------
  34 // General Registers
  35 // "reg_def"  name ( register save type, C convention save type,
  36 //                   ideal register type, encoding );
  37 // Register Save Types:
  38 //
  39 // NS  = No-Save:       The register allocator assumes that these registers
  40 //                      can be used without saving upon entry to the method, &
  41 //                      that they do not need to be saved at call sites.
  42 //
  43 // SOC = Save-On-Call:  The register allocator assumes that these registers
  44 //                      can be used without saving upon entry to the method,
  45 //                      but that they must be saved at call sites.
  46 //
  47 // SOE = Save-On-Entry: The register allocator assumes that these registers
  48 //                      must be saved before using them upon entry to the
  49 //                      method, but they do not need to be saved at call
  50 //                      sites.
  51 //
  52 // AS  = Always-Save:   The register allocator assumes that these registers
  53 //                      must be saved before using them upon entry to the
  54 //                      method, & that they must be saved at call sites.
  55 //
  56 // Ideal Register Type is used to determine how to save & restore a
  57 // register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
  58 // spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
  59 //
  60 // The encoding number is the actual bit-pattern placed into the opcodes.
  61 
  62 // XMM registers.  512-bit registers or 8 words each, labeled (a)-p.
  63 // Word a in each register holds a Float, words ab hold a Double.
  64 // The whole registers are used in SSE4.2 version intrinsics,
  65 // array copy stubs and superword operations (see UseSSE42Intrinsics,
  66 // UseXMMForArrayCopy and UseSuperword flags).
  67 // For pre EVEX enabled architectures:
  68 //      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
  69 // For EVEX enabled architectures:
  70 //      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
  71 //
  72 // Linux ABI:   No register preserved across function calls
  73 //              XMM0-XMM7 might hold parameters
  74 // Windows ABI: XMM6-XMM31 preserved across function calls
  75 //              XMM0-XMM3 might hold parameters
  76 
  77 reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
  78 reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
  79 reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
  80 reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
  81 reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
  82 reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
  83 reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
  84 reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
  85 reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
  86 reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
  87 reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
  88 reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
  89 reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
  90 reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
  91 reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
  92 reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));
  93 
  94 reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
  95 reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
  96 reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
  97 reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
  98 reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
  99 reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
 100 reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
 101 reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
 102 reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
 103 reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
 104 reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
 105 reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
 106 reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
 107 reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
 108 reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
 109 reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));
 110 
 111 reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
 112 reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
 113 reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
 114 reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
 115 reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
 116 reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
 117 reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
 118 reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
 119 reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
 120 reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
 121 reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
 122 reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
 123 reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
 124 reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
 125 reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
 126 reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));
 127 
 128 reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
 129 reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
 130 reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
 131 reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
 132 reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
 133 reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
 134 reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
 135 reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
 136 reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
 137 reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
 138 reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
 139 reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
 140 reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
 141 reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
 142 reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
 143 reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));
 144 
 145 reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
 146 reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
 147 reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
 148 reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
 149 reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
 150 reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
 151 reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
 152 reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
 153 reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
 154 reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
 155 reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
 156 reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
 157 reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
 158 reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
 159 reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
 160 reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));
 161 
 162 reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
 163 reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
 164 reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
 165 reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
 166 reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
 167 reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
 168 reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
 169 reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
 170 reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
 171 reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
 172 reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
 173 reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
 174 reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
 175 reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
 176 reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
 177 reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));
 178 
 179 reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
 180 reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 181 reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 182 reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 183 reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 184 reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 185 reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 186 reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 187 reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
 188 reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
 189 reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
 190 reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
 191 reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
 192 reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
 193 reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
 194 reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));
 195 
 196 reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
 197 reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 198 reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 199 reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 200 reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 201 reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 202 reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 203 reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 204 reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
 205 reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
 206 reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
 207 reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
 208 reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
 209 reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
 210 reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
 211 reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));
 212 
 213 #ifdef _LP64
 214 
 215 reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
 216 reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 217 reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 218 reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 219 reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 220 reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 221 reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 222 reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 223 reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
 224 reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
 225 reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
 226 reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
 227 reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
 228 reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
 229 reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
 230 reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));
 231 
 232 reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
 233 reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 234 reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 235 reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 236 reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 237 reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 238 reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 239 reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 240 reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
 241 reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
 242 reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
 243 reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
 244 reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
 245 reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
 246 reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
 247 reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));
 248 
 249 reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
 250 reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 251 reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 252 reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 253 reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 254 reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 255 reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 256 reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 257 reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
 258 reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
 259 reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
 260 reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
 261 reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
 262 reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
 263 reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
 264 reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));
 265 
 266 reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
 267 reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 268 reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 269 reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 270 reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 271 reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 272 reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 273 reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 274 reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
 275 reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
 276 reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
 277 reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
 278 reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
 279 reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
 280 reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
 281 reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));
 282 
 283 reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
 284 reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 285 reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 286 reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 287 reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 288 reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 289 reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 290 reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 291 reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
 292 reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
 293 reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
 294 reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
 295 reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
 296 reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
 297 reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
 298 reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));
 299 
 300 reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
 301 reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 302 reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 303 reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 304 reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 305 reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 306 reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 307 reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 308 reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
 309 reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
 310 reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
 311 reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
 312 reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
 313 reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
 314 reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
 315 reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));
 316 
 317 reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
 318 reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 319 reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 320 reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 321 reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 322 reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 323 reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 324 reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 325 reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
 326 reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
 327 reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
 328 reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
 329 reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
 330 reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
 331 reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
 332 reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));
 333 
 334 reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
 335 reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 336 reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 337 reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 338 reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 339 reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 340 reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 341 reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 342 reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
 343 reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
 344 reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
 345 reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
 346 reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
 347 reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
 348 reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
 349 reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));
 350 
 351 reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
 352 reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
 353 reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
 354 reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
 355 reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
 356 reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
 357 reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
 358 reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
 359 reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
 360 reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
 361 reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
 362 reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
 363 reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
 364 reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
 365 reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
 366 reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));
 367 
 368 reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
 369 reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
 370 reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
 371 reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
 372 reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
 373 reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
 374 reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
 375 reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
 376 reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
 377 reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
 378 reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
 379 reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
 380 reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
 381 reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
 382 reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
 383 reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));
 384 
 385 reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
 386 reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
 387 reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
 388 reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
 389 reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
 390 reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
 391 reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
 392 reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
 393 reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
 394 reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
 395 reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
 396 reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
 397 reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
 398 reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
 399 reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
 400 reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));
 401 
 402 reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
 403 reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
 404 reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
 405 reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
 406 reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
 407 reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
 408 reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
 409 reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
 410 reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
 411 reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
 412 reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
 413 reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
 414 reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
 415 reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
 416 reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
 417 reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));
 418 
 419 reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
 420 reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
 421 reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
 422 reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
 423 reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
 424 reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
 425 reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
 426 reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
 427 reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
 428 reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
 429 reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
 430 reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
 431 reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
 432 reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
 433 reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
 434 reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));
 435 
 436 reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
 437 reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
 438 reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
 439 reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
 440 reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
 441 reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
 442 reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
 443 reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
 444 reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
 445 reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
 446 reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
 447 reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
 448 reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
 449 reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
 450 reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
 451 reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));
 452 
 453 reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
 454 reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
 455 reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
 456 reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
 457 reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
 458 reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
 459 reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
 460 reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
 461 reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
 462 reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
 463 reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
 464 reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
 465 reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
 466 reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
 467 reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
 468 reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));
 469 
 470 reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
 471 reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
 472 reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
 473 reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
 474 reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
 475 reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
 476 reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
 477 reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
 478 reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
 479 reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
 480 reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
 481 reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
 482 reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
 483 reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
 484 reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
 485 reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));
 486 
 487 reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
 488 reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
 489 reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
 490 reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
 491 reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
 492 reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
 493 reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
 494 reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
 495 reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
 496 reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
 497 reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
 498 reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
 499 reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
 500 reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
 501 reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
 502 reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));
 503 
 504 reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
 505 reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
 506 reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
 507 reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
 508 reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
 509 reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
 510 reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
 511 reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
 512 reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
 513 reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
 514 reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
 515 reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
 516 reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
 517 reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
 518 reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
 519 reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));
 520 
 521 reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
 522 reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
 523 reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
 524 reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
 525 reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
 526 reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
 527 reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
 528 reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
 529 reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
 530 reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
 531 reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
 532 reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
 533 reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
 534 reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
 535 reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
 536 reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));
 537 
 538 reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
 539 reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
 540 reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
 541 reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
 542 reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
 543 reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
 544 reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
 545 reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
 546 reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
 547 reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
 548 reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
 549 reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
 550 reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
 551 reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
 552 reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
 553 reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));
 554 
 555 reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
 556 reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
 557 reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
 558 reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
 559 reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
 560 reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
 561 reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
 562 reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
 563 reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
 564 reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
 565 reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
 566 reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
 567 reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
 568 reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
 569 reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
 570 reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));
 571 
 572 reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
 573 reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
 574 reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
 575 reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
 576 reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
 577 reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
 578 reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
 579 reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
 580 reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
 581 reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
 582 reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
 583 reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
 584 reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
 585 reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
 586 reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
 587 reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));
 588 
 589 reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
 590 reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
 591 reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
 592 reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
 593 reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
 594 reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
 595 reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
 596 reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
 597 reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
 598 reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
 599 reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
 600 reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
 601 reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
 602 reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
 603 reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
 604 reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));
 605 
 606 reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
 607 reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
 608 reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
 609 reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
 610 reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
 611 reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
 612 reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
 613 reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
 614 reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
 615 reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
 616 reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
 617 reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
 618 reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
 619 reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
 620 reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
 621 reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));
 622 
 623 #endif // _LP64
 624 
 625 #ifdef _LP64
 626 reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
 627 #else
 628 reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
 629 #endif // _LP64
 630 
 631 alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 632                    XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 633                    XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 634                    XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 635                    XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 636                    XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 637                    XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 638                    XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 639 #ifdef _LP64
 640                   ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 641                    XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 642                    XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 643                    XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 644                    XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 645                    XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 646                    XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 647                    XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 648                   ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 649                    XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 650                    XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 651                    XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 652                    XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 653                    XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 654                    XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 655                    XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 656                    XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 657                    XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 658                    XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 659                    XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 660                    XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 661                    XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 662                    XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 663                    XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 664 #endif
 665                       );
 666 
 667 // flags allocation class should be last.
 668 alloc_class chunk2(RFLAGS);
 669 
 670 // Singleton class for condition codes
 671 reg_class int_flags(RFLAGS);
 672 
 673 // Class for pre evex float registers
 674 reg_class float_reg_legacy(XMM0,
 675                     XMM1,
 676                     XMM2,
 677                     XMM3,
 678                     XMM4,
 679                     XMM5,
 680                     XMM6,
 681                     XMM7
 682 #ifdef _LP64
 683                    ,XMM8,
 684                     XMM9,
 685                     XMM10,
 686                     XMM11,
 687                     XMM12,
 688                     XMM13,
 689                     XMM14,
 690                     XMM15
 691 #endif
 692                     );
 693 
 694 // Class for evex float registers
 695 reg_class float_reg_evex(XMM0,
 696                     XMM1,
 697                     XMM2,
 698                     XMM3,
 699                     XMM4,
 700                     XMM5,
 701                     XMM6,
 702                     XMM7
 703 #ifdef _LP64
 704                    ,XMM8,
 705                     XMM9,
 706                     XMM10,
 707                     XMM11,
 708                     XMM12,
 709                     XMM13,
 710                     XMM14,
 711                     XMM15,
 712                     XMM16,
 713                     XMM17,
 714                     XMM18,
 715                     XMM19,
 716                     XMM20,
 717                     XMM21,
 718                     XMM22,
 719                     XMM23,
 720                     XMM24,
 721                     XMM25,
 722                     XMM26,
 723                     XMM27,
 724                     XMM28,
 725                     XMM29,
 726                     XMM30,
 727                     XMM31
 728 #endif
 729                     );
 730 
 731 reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
 732 
 733 // Class for pre evex double registers
 734 reg_class double_reg_legacy(XMM0,  XMM0b,
 735                      XMM1,  XMM1b,
 736                      XMM2,  XMM2b,
 737                      XMM3,  XMM3b,
 738                      XMM4,  XMM4b,
 739                      XMM5,  XMM5b,
 740                      XMM6,  XMM6b,
 741                      XMM7,  XMM7b
 742 #ifdef _LP64
 743                     ,XMM8,  XMM8b,
 744                      XMM9,  XMM9b,
 745                      XMM10, XMM10b,
 746                      XMM11, XMM11b,
 747                      XMM12, XMM12b,
 748                      XMM13, XMM13b,
 749                      XMM14, XMM14b,
 750                      XMM15, XMM15b
 751 #endif
 752                      );
 753 
 754 // Class for evex double registers
 755 reg_class double_reg_evex(XMM0,  XMM0b,
 756                      XMM1,  XMM1b,
 757                      XMM2,  XMM2b,
 758                      XMM3,  XMM3b,
 759                      XMM4,  XMM4b,
 760                      XMM5,  XMM5b,
 761                      XMM6,  XMM6b,
 762                      XMM7,  XMM7b
 763 #ifdef _LP64
 764                     ,XMM8,  XMM8b,
 765                      XMM9,  XMM9b,
 766                      XMM10, XMM10b,
 767                      XMM11, XMM11b,
 768                      XMM12, XMM12b,
 769                      XMM13, XMM13b,
 770                      XMM14, XMM14b,
 771                      XMM15, XMM15b,
 772                      XMM16, XMM16b,
 773                      XMM17, XMM17b,
 774                      XMM18, XMM18b,
 775                      XMM19, XMM19b,
 776                      XMM20, XMM20b,
 777                      XMM21, XMM21b,
 778                      XMM22, XMM22b,
 779                      XMM23, XMM23b,
 780                      XMM24, XMM24b,
 781                      XMM25, XMM25b,
 782                      XMM26, XMM26b,
 783                      XMM27, XMM27b,
 784                      XMM28, XMM28b,
 785                      XMM29, XMM29b,
 786                      XMM30, XMM30b,
 787                      XMM31, XMM31b
 788 #endif
 789                      );
 790 
 791 reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
 792 
 793 // Class for pre evex 32bit vector registers
 794 reg_class vectors_reg_legacy(XMM0,
 795                       XMM1,
 796                       XMM2,
 797                       XMM3,
 798                       XMM4,
 799                       XMM5,
 800                       XMM6,
 801                       XMM7
 802 #ifdef _LP64
 803                      ,XMM8,
 804                       XMM9,
 805                       XMM10,
 806                       XMM11,
 807                       XMM12,
 808                       XMM13,
 809                       XMM14,
 810                       XMM15
 811 #endif
 812                       );
 813 
 814 // Class for evex 32bit vector registers
 815 reg_class vectors_reg_evex(XMM0,
 816                       XMM1,
 817                       XMM2,
 818                       XMM3,
 819                       XMM4,
 820                       XMM5,
 821                       XMM6,
 822                       XMM7
 823 #ifdef _LP64
 824                      ,XMM8,
 825                       XMM9,
 826                       XMM10,
 827                       XMM11,
 828                       XMM12,
 829                       XMM13,
 830                       XMM14,
 831                       XMM15,
 832                       XMM16,
 833                       XMM17,
 834                       XMM18,
 835                       XMM19,
 836                       XMM20,
 837                       XMM21,
 838                       XMM22,
 839                       XMM23,
 840                       XMM24,
 841                       XMM25,
 842                       XMM26,
 843                       XMM27,
 844                       XMM28,
 845                       XMM29,
 846                       XMM30,
 847                       XMM31
 848 #endif
 849                       );
 850 
 851 reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
 852 
 853 // Class for all 64bit vector registers
 854 reg_class vectord_reg_legacy(XMM0,  XMM0b,
 855                       XMM1,  XMM1b,
 856                       XMM2,  XMM2b,
 857                       XMM3,  XMM3b,
 858                       XMM4,  XMM4b,
 859                       XMM5,  XMM5b,
 860                       XMM6,  XMM6b,
 861                       XMM7,  XMM7b
 862 #ifdef _LP64
 863                      ,XMM8,  XMM8b,
 864                       XMM9,  XMM9b,
 865                       XMM10, XMM10b,
 866                       XMM11, XMM11b,
 867                       XMM12, XMM12b,
 868                       XMM13, XMM13b,
 869                       XMM14, XMM14b,
 870                       XMM15, XMM15b
 871 #endif
 872                       );
 873 
 874 // Class for all 64bit vector registers
 875 reg_class vectord_reg_evex(XMM0,  XMM0b,
 876                       XMM1,  XMM1b,
 877                       XMM2,  XMM2b,
 878                       XMM3,  XMM3b,
 879                       XMM4,  XMM4b,
 880                       XMM5,  XMM5b,
 881                       XMM6,  XMM6b,
 882                       XMM7,  XMM7b
 883 #ifdef _LP64
 884                      ,XMM8,  XMM8b,
 885                       XMM9,  XMM9b,
 886                       XMM10, XMM10b,
 887                       XMM11, XMM11b,
 888                       XMM12, XMM12b,
 889                       XMM13, XMM13b,
 890                       XMM14, XMM14b,
 891                       XMM15, XMM15b,
 892                       XMM16, XMM16b,
 893                       XMM17, XMM17b,
 894                       XMM18, XMM18b,
 895                       XMM19, XMM19b,
 896                       XMM20, XMM20b,
 897                       XMM21, XMM21b,
 898                       XMM22, XMM22b,
 899                       XMM23, XMM23b,
 900                       XMM24, XMM24b,
 901                       XMM25, XMM25b,
 902                       XMM26, XMM26b,
 903                       XMM27, XMM27b,
 904                       XMM28, XMM28b,
 905                       XMM29, XMM29b,
 906                       XMM30, XMM30b,
 907                       XMM31, XMM31b
 908 #endif
 909                       );
 910 
 911 reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
 912 
 913 // Class for all 128bit vector registers
 914 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
 915                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 916                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 917                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 918                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 919                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 920                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 921                       XMM7,  XMM7b,  XMM7c,  XMM7d
 922 #ifdef _LP64
 923                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 924                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 925                       XMM10, XMM10b, XMM10c, XMM10d,
 926                       XMM11, XMM11b, XMM11c, XMM11d,
 927                       XMM12, XMM12b, XMM12c, XMM12d,
 928                       XMM13, XMM13b, XMM13c, XMM13d,
 929                       XMM14, XMM14b, XMM14c, XMM14d,
 930                       XMM15, XMM15b, XMM15c, XMM15d
 931 #endif
 932                       );
 933 
 934 // Class for all 128bit vector registers
 935 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
 936                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 937                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 938                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 939                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 940                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 941                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 942                       XMM7,  XMM7b,  XMM7c,  XMM7d
 943 #ifdef _LP64
 944                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 945                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 946                       XMM10, XMM10b, XMM10c, XMM10d,
 947                       XMM11, XMM11b, XMM11c, XMM11d,
 948                       XMM12, XMM12b, XMM12c, XMM12d,
 949                       XMM13, XMM13b, XMM13c, XMM13d,
 950                       XMM14, XMM14b, XMM14c, XMM14d,
 951                       XMM15, XMM15b, XMM15c, XMM15d,
 952                       XMM16, XMM16b, XMM16c, XMM16d,
 953                       XMM17, XMM17b, XMM17c, XMM17d,
 954                       XMM18, XMM18b, XMM18c, XMM18d,
 955                       XMM19, XMM19b, XMM19c, XMM19d,
 956                       XMM20, XMM20b, XMM20c, XMM20d,
 957                       XMM21, XMM21b, XMM21c, XMM21d,
 958                       XMM22, XMM22b, XMM22c, XMM22d,
 959                       XMM23, XMM23b, XMM23c, XMM23d,
 960                       XMM24, XMM24b, XMM24c, XMM24d,
 961                       XMM25, XMM25b, XMM25c, XMM25d,
 962                       XMM26, XMM26b, XMM26c, XMM26d,
 963                       XMM27, XMM27b, XMM27c, XMM27d,
 964                       XMM28, XMM28b, XMM28c, XMM28d,
 965                       XMM29, XMM29b, XMM29c, XMM29d,
 966                       XMM30, XMM30b, XMM30c, XMM30d,
 967                       XMM31, XMM31b, XMM31c, XMM31d
 968 #endif
 969                       );
 970 
 971 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 972 
 973 // Class for all 256bit vector registers
 974 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 975                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 976                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 977                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 978                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 979                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 980                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 981                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 982 #ifdef _LP64
 983                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 984                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 985                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 986                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 987                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 988                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 989                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 990                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 991 #endif
 992                       );
 993 
 994 // Class for all 256bit vector registers
 995 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 996                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 997                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 998                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 999                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1000                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1001                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1002                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1003 #ifdef _LP64
1004                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1005                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1006                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1007                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1008                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1009                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1010                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1011                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1012                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1013                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1014                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1015                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1016                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1017                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1018                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1019                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1020                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1021                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1022                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1023                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1024                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1025                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1026                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1027                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1028 #endif
1029                       );
1030 
1031 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1032 
1033 // Class for all 512bit vector registers
1034 reg_class vectorz_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1035                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1036                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1037                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1038                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1039                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1040                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1041                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1042 #ifdef _LP64
1043                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1044                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1045                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1046                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1047                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1048                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1049                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1050                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1051                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1052                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1053                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1054                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1055                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1056                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1057                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1058                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1059                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1060                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1061                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1062                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1063                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1064                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1065                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1066                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1067 #endif
1068                       );
1069 
1070 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1071 reg_class ymm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h);
1072 reg_class zmm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, XMM0i, XMM0j, XMM0k, XMM0l, XMM0m, XMM0n, XMM0o, XMM0p);
1073 
1074 reg_class xmm1_reg(XMM1, XMM1b, XMM1c, XMM1d);
1075 reg_class ymm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h);
1076 reg_class zmm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h, XMM1i, XMM1j, XMM1k, XMM1l, XMM1m, XMM1n, XMM1o, XMM1p);
1077 
1078 reg_class xmm2_reg(XMM2, XMM2b, XMM2c, XMM2d);
1079 reg_class ymm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h);
1080 reg_class zmm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h, XMM2i, XMM2j, XMM2k, XMM2l, XMM2m, XMM2n, XMM2o, XMM2p);
1081 
1082 reg_class xmm3_reg(XMM3, XMM3b, XMM3c, XMM3d);
1083 reg_class ymm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h);
1084 reg_class zmm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h, XMM3i, XMM3j, XMM3k, XMM3l, XMM3m, XMM3n, XMM3o, XMM3p);
1085 
1086 reg_class xmm4_reg(XMM4, XMM4b, XMM4c, XMM4d);
1087 reg_class ymm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h);
1088 reg_class zmm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h, XMM4i, XMM4j, XMM4k, XMM4l, XMM4m, XMM4n, XMM4o, XMM4p);
1089 
1090 reg_class xmm5_reg(XMM5, XMM5b, XMM5c, XMM5d);
1091 reg_class ymm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h);
1092 reg_class zmm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h, XMM5i, XMM5j, XMM5k, XMM5l, XMM5m, XMM5n, XMM5o, XMM5p);
1093 
1094 reg_class xmm6_reg(XMM6, XMM6b, XMM6c, XMM6d);
1095 reg_class ymm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h);
1096 reg_class zmm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h, XMM6i, XMM6j, XMM6k, XMM6l, XMM6m, XMM6n, XMM6o, XMM6p);
1097 
1098 reg_class xmm7_reg(XMM7, XMM7b, XMM7c, XMM7d);
1099 reg_class ymm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h);
1100 reg_class zmm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h, XMM7i, XMM7j, XMM7k, XMM7l, XMM7m, XMM7n, XMM7o, XMM7p);
1101 
1102 #ifdef _LP64
1103 
1104 reg_class xmm8_reg(XMM8, XMM8b, XMM8c, XMM8d);
1105 reg_class ymm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h);
1106 reg_class zmm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h, XMM8i, XMM8j, XMM8k, XMM8l, XMM8m, XMM8n, XMM8o, XMM8p);
1107 
1108 reg_class xmm9_reg(XMM9, XMM9b, XMM9c, XMM9d);
1109 reg_class ymm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h);
1110 reg_class zmm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h, XMM9i, XMM9j, XMM9k, XMM9l, XMM9m, XMM9n, XMM9o, XMM9p);
1111 
1112 reg_class xmm10_reg(XMM10, XMM10b, XMM10c, XMM10d);
1113 reg_class ymm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h);
1114 reg_class zmm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p);
1115 
1116 reg_class xmm11_reg(XMM11, XMM11b, XMM11c, XMM11d);
1117 reg_class ymm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h);
1118 reg_class zmm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p);
1119 
1120 reg_class xmm12_reg(XMM12, XMM12b, XMM12c, XMM12d);
1121 reg_class ymm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h);
1122 reg_class zmm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p);
1123 
1124 reg_class xmm13_reg(XMM13, XMM13b, XMM13c, XMM13d);
1125 reg_class ymm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h);
1126 reg_class zmm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p);
1127 
1128 reg_class xmm14_reg(XMM14, XMM14b, XMM14c, XMM14d);
1129 reg_class ymm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h);
1130 reg_class zmm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p);
1131 
1132 reg_class xmm15_reg(XMM15, XMM15b, XMM15c, XMM15d);
1133 reg_class ymm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h);
1134 reg_class zmm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p);
1135 
1136 reg_class xmm16_reg(XMM16, XMM16b, XMM16c, XMM16d);
1137 reg_class ymm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h);
1138 reg_class zmm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p);
1139 
1140 reg_class xmm17_reg(XMM17, XMM17b, XMM17c, XMM17d);
1141 reg_class ymm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h);
1142 reg_class zmm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p);
1143 
1144 reg_class xmm18_reg(XMM18, XMM18b, XMM18c, XMM18d);
1145 reg_class ymm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h);
1146 reg_class zmm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p);
1147 
1148 reg_class xmm19_reg(XMM19, XMM19b, XMM19c, XMM19d);
1149 reg_class ymm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h);
1150 reg_class zmm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p);
1151 
1152 reg_class xmm20_reg(XMM20, XMM20b, XMM20c, XMM20d);
1153 reg_class ymm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h);
1154 reg_class zmm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p);
1155 
1156 reg_class xmm21_reg(XMM21, XMM21b, XMM21c, XMM21d);
1157 reg_class ymm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h);
1158 reg_class zmm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p);
1159 
1160 reg_class xmm22_reg(XMM22, XMM22b, XMM22c, XMM22d);
1161 reg_class ymm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h);
1162 reg_class zmm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p);
1163 
1164 reg_class xmm23_reg(XMM23, XMM23b, XMM23c, XMM23d);
1165 reg_class ymm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h);
1166 reg_class zmm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p);
1167 
1168 reg_class xmm24_reg(XMM24, XMM24b, XMM24c, XMM24d);
1169 reg_class ymm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h);
1170 reg_class zmm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p);
1171 
1172 reg_class xmm25_reg(XMM25, XMM25b, XMM25c, XMM25d);
1173 reg_class ymm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h);
1174 reg_class zmm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p);
1175 
1176 reg_class xmm26_reg(XMM26, XMM26b, XMM26c, XMM26d);
1177 reg_class ymm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h);
1178 reg_class zmm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p);
1179 
1180 reg_class xmm27_reg(XMM27, XMM27b, XMM27c, XMM27d);
1181 reg_class ymm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h);
1182 reg_class zmm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p);
1183 
1184 reg_class xmm28_reg(XMM28, XMM28b, XMM28c, XMM28d);
1185 reg_class ymm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h);
1186 reg_class zmm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p);
1187 
1188 reg_class xmm29_reg(XMM29, XMM29b, XMM29c, XMM29d);
1189 reg_class ymm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h);
1190 reg_class zmm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p);
1191 
1192 reg_class xmm30_reg(XMM30, XMM30b, XMM30c, XMM30d);
1193 reg_class ymm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h);
1194 reg_class zmm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p);
1195 
1196 reg_class xmm31_reg(XMM31, XMM31b, XMM31c, XMM31d);
1197 reg_class ymm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h);
1198 reg_class zmm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);
1199 
1200 #endif
1201 
1202 %}
1203 
1204 
1205 //----------SOURCE BLOCK-------------------------------------------------------
1206 // This is a block of C++ code which provides values, functions, and
1207 // definitions necessary in the rest of the architecture description
1208 
1209 source_hpp %{
1210 // Header information of the source block.
1211 // Method declarations/definitions which are used outside
1212 // the ad-scope can conveniently be defined here.
1213 //
1214 // To keep related declarations/definitions/uses close together,
1215 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
1216 
1217 class NativeJump;
1218 
1219 class CallStubImpl {
1220 
1221   //--------------------------------------------------------------
1222   //---<  Used for optimization in Compile::shorten_branches  >---
1223   //--------------------------------------------------------------
1224 
1225  public:
1226   // Size of call trampoline stub.
1227   static uint size_call_trampoline() {
1228     return 0; // no call trampolines on this platform
1229   }
1230 
1231   // number of relocations needed by a call trampoline stub
1232   static uint reloc_call_trampoline() {
1233     return 0; // no call trampolines on this platform
1234   }
1235 };
1236 
1237 class HandlerImpl {
1238 
1239  public:
1240 
1241   static int emit_exception_handler(CodeBuffer &cbuf);
1242   static int emit_deopt_handler(CodeBuffer& cbuf);
1243 
1244   static uint size_exception_handler() {
1245     // NativeCall instruction size is the same as NativeJump.
1246     // exception handler starts out as jump and can be patched to
1247     // a call be deoptimization.  (4932387)
1248     // Note that this value is also credited (in output.cpp) to
1249     // the size of the code section.
1250     return NativeJump::instruction_size;
1251   }
1252 
1253 #ifdef _LP64
1254   static uint size_deopt_handler() {
1255     // three 5 byte instructions
1256     return 15;
1257   }
1258 #else
1259   static uint size_deopt_handler() {
1260     // NativeCall instruction size is the same as NativeJump.
1261     // exception handler starts out as jump and can be patched to
1262     // a call be deoptimization.  (4932387)
1263     // Note that this value is also credited (in output.cpp) to
1264     // the size of the code section.
1265     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1266   }
1267 #endif
1268 };
1269 
1270 %} // end source_hpp
1271 
1272 source %{
1273 
1274 #include "opto/addnode.hpp"
1275 
1276 // Emit exception handler code.
1277 // Stuff framesize into a register and call a VM stub routine.
1278 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1279 
1280   // Note that the code buffer's insts_mark is always relative to insts.
1281   // That's why we must use the macroassembler to generate a handler.
1282   MacroAssembler _masm(&cbuf);
1283   address base = __ start_a_stub(size_exception_handler());
1284   if (base == NULL) {
1285     ciEnv::current()->record_failure("CodeCache is full");
1286     return 0;  // CodeBuffer::expand failed
1287   }
1288   int offset = __ offset();
1289   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1290   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1291   __ end_a_stub();
1292   return offset;
1293 }
1294 
1295 // Emit deopt handler code.
1296 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1297 
1298   // Note that the code buffer's insts_mark is always relative to insts.
1299   // That's why we must use the macroassembler to generate a handler.
1300   MacroAssembler _masm(&cbuf);
1301   address base = __ start_a_stub(size_deopt_handler());
1302   if (base == NULL) {
1303     ciEnv::current()->record_failure("CodeCache is full");
1304     return 0;  // CodeBuffer::expand failed
1305   }
1306   int offset = __ offset();
1307 
1308 #ifdef _LP64
1309   address the_pc = (address) __ pc();
1310   Label next;
1311   // push a "the_pc" on the stack without destroying any registers
1312   // as they all may be live.
1313 
1314   // push address of "next"
1315   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1316   __ bind(next);
1317   // adjust it so it matches "the_pc"
1318   __ subptr(Address(rsp, 0), __ offset() - offset);
1319 #else
1320   InternalAddress here(__ pc());
1321   __ pushptr(here.addr());
1322 #endif
1323 
1324   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1325   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow");
1326   __ end_a_stub();
1327   return offset;
1328 }
1329 
1330 
1331 //=============================================================================
1332 
1333   // Float masks come from different places depending on platform.
1334 #ifdef _LP64
1335   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1336   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1337   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1338   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1339 #else
1340   static address float_signmask()  { return (address)float_signmask_pool; }
1341   static address float_signflip()  { return (address)float_signflip_pool; }
1342   static address double_signmask() { return (address)double_signmask_pool; }
1343   static address double_signflip() { return (address)double_signflip_pool; }
1344 #endif
1345 
1346 
1347 const bool Matcher::match_rule_supported(int opcode) {
1348   if (!has_match_rule(opcode))
1349     return false;
1350 
1351   bool ret_value = true;
1352   switch (opcode) {
1353     case Op_PopCountI:
1354     case Op_PopCountL:
1355       if (!UsePopCountInstruction)
1356         ret_value = false;
1357       break;
1358     case Op_PopCountVI:
1359       if (!UsePopCountInstruction || !VM_Version::supports_vpopcntdq())
1360         ret_value = false;
1361       break;
1362     case Op_MulVI:
1363       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
1364         ret_value = false;
1365       break;
1366     case Op_MulVL:
1367     case Op_MulReductionVL:
1368       if (VM_Version::supports_avx512dq() == false)
1369         ret_value = false;
1370       break;
1371     case Op_AddReductionVL:
1372       if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here
1373         ret_value = false;
1374       break;
1375     case Op_AddReductionVI:
1376       if (UseSSE < 3) // requires at least SSE3
1377         ret_value = false;
1378       break;
1379     case Op_MulReductionVI:
1380       if (UseSSE < 4) // requires at least SSE4
1381         ret_value = false;
1382       break;
1383     case Op_AddReductionVF:
1384     case Op_AddReductionVD:
1385     case Op_MulReductionVF:
1386     case Op_MulReductionVD:
1387       if (UseSSE < 1) // requires at least SSE
1388         ret_value = false;
1389       break;
1390     case Op_SqrtVD:
1391     case Op_SqrtVF:
1392       if (UseAVX < 1) // enabled for AVX only
1393         ret_value = false;
1394       break;
1395     case Op_CompareAndSwapL:
1396 #ifdef _LP64
1397     case Op_CompareAndSwapP:
1398 #endif
1399       if (!VM_Version::supports_cx8())
1400         ret_value = false;
1401       break;
1402     case Op_CMoveVF:
1403     case Op_CMoveVD:
1404       if (UseAVX < 1 || UseAVX > 2)
1405         ret_value = false;
1406       break;
1407     case Op_StrIndexOf:
1408       if (!UseSSE42Intrinsics)
1409         ret_value = false;
1410       break;
1411     case Op_StrIndexOfChar:
1412       if (!UseSSE42Intrinsics)
1413         ret_value = false;
1414       break;
1415     case Op_OnSpinWait:
1416       if (VM_Version::supports_on_spin_wait() == false)
1417         ret_value = false;
1418       break;
1419   }
1420 
1421   return ret_value;  // Per default match rules are supported.
1422 }
1423 
1424 const bool Matcher::match_rule_supported_vector(int opcode, int vlen) {
1425   // identify extra cases that we might want to provide match rules for
1426   // e.g. Op_ vector nodes and other intrinsics while guarding with vlen
1427   bool ret_value = match_rule_supported(opcode);
1428   if (ret_value) {
1429     switch (opcode) {
1430       case Op_AddVB:
1431       case Op_SubVB:
1432         if ((vlen == 64) && (VM_Version::supports_avx512bw() == false))
1433           ret_value = false;
1434         break;
1435       case Op_URShiftVS:
1436       case Op_RShiftVS:
1437       case Op_LShiftVS:
1438       case Op_MulVS:
1439       case Op_AddVS:
1440       case Op_SubVS:
1441         if ((vlen == 32) && (VM_Version::supports_avx512bw() == false))
1442           ret_value = false;
1443         break;
1444       case Op_CMoveVF:
1445         if (vlen != 8)
1446           ret_value  = false;
1447       case Op_CMoveVD:
1448         if (vlen != 4)
1449           ret_value  = false;
1450         break;
1451     }
1452   }
1453 
1454   return ret_value;  // Per default match rules are supported.
1455 }
1456 
1457 const bool Matcher::has_predicated_vectors(void) {
1458   bool ret_value = false;
1459   if (UseAVX > 2) {
1460     ret_value = VM_Version::supports_avx512vl();
1461   }
1462 
1463   return ret_value;
1464 }
1465 
1466 const int Matcher::float_pressure(int default_pressure_threshold) {
1467   int float_pressure_threshold = default_pressure_threshold;
1468 #ifdef _LP64
1469   if (UseAVX > 2) {
1470     // Increase pressure threshold on machines with AVX3 which have
1471     // 2x more XMM registers.
1472     float_pressure_threshold = default_pressure_threshold * 2;
1473   }
1474 #endif
1475   return float_pressure_threshold;
1476 }
1477 
1478 // Max vector size in bytes. 0 if not supported.
1479 const int Matcher::vector_width_in_bytes(BasicType bt) {
1480   assert(is_java_primitive(bt), "only primitive type vectors");
1481   if (UseSSE < 2) return 0;
1482   // SSE2 supports 128bit vectors for all types.
1483   // AVX2 supports 256bit vectors for all types.
1484   // AVX2/EVEX supports 512bit vectors for all types.
1485   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
1486   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
1487   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1488     size = (UseAVX > 2) ? 64 : 32;
1489   // Use flag to limit vector size.
1490   size = MIN2(size,(int)MaxVectorSize);
1491   // Minimum 2 values in vector (or 4 for bytes).
1492   switch (bt) {
1493   case T_DOUBLE:
1494   case T_LONG:
1495     if (size < 16) return 0;
1496     break;
1497   case T_FLOAT:
1498   case T_INT:
1499     if (size < 8) return 0;
1500     break;
1501   case T_BOOLEAN:
1502     if (size < 4) return 0;
1503     break;
1504   case T_CHAR:
1505     if (size < 4) return 0;
1506     break;
1507   case T_BYTE:
1508     if (size < 4) return 0;
1509     break;
1510   case T_SHORT:
1511     if (size < 4) return 0;
1512     break;
1513   default:
1514     ShouldNotReachHere();
1515   }
1516   return size;
1517 }
1518 
1519 // Limits on vector size (number of elements) loaded into vector.
1520 const int Matcher::max_vector_size(const BasicType bt) {
1521   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1522 }
1523 const int Matcher::min_vector_size(const BasicType bt) {
1524   int max_size = max_vector_size(bt);
1525   // Min size which can be loaded into vector is 4 bytes.
1526   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
1527   return MIN2(size,max_size);
1528 }
1529 
1530 // Vector ideal reg corresponding to specidied size in bytes
1531 const uint Matcher::vector_ideal_reg(int size) {
1532   assert(MaxVectorSize >= size, "");
1533   switch(size) {
1534     case  4: return Op_VecS;
1535     case  8: return Op_VecD;
1536     case 16: return Op_VecX;
1537     case 32: return Op_VecY;
1538     case 64: return Op_VecZ;
1539   }
1540   ShouldNotReachHere();
1541   return 0;
1542 }
1543 
1544 // Only lowest bits of xmm reg are used for vector shift count.
1545 const uint Matcher::vector_shift_count_ideal_reg(int size) {
1546   return Op_VecS;
1547 }
1548 
1549 // x86 supports misaligned vectors store/load.
1550 const bool Matcher::misaligned_vectors_ok() {
1551   return !AlignVector; // can be changed by flag
1552 }
1553 
1554 // x86 AES instructions are compatible with SunJCE expanded
1555 // keys, hence we do not need to pass the original key to stubs
1556 const bool Matcher::pass_original_key_for_aes() {
1557   return false;
1558 }
1559 
1560 
1561 const bool Matcher::convi2l_type_required = true;
1562 
1563 // Check for shift by small constant as well
1564 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
1565   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
1566       shift->in(2)->get_int() <= 3 &&
1567       // Are there other uses besides address expressions?
1568       !matcher->is_visited(shift)) {
1569     address_visited.set(shift->_idx); // Flag as address_visited
1570     mstack.push(shift->in(2), Matcher::Visit);
1571     Node *conv = shift->in(1);
1572 #ifdef _LP64
1573     // Allow Matcher to match the rule which bypass
1574     // ConvI2L operation for an array index on LP64
1575     // if the index value is positive.
1576     if (conv->Opcode() == Op_ConvI2L &&
1577         conv->as_Type()->type()->is_long()->_lo >= 0 &&
1578         // Are there other uses besides address expressions?
1579         !matcher->is_visited(conv)) {
1580       address_visited.set(conv->_idx); // Flag as address_visited
1581       mstack.push(conv->in(1), Matcher::Pre_Visit);
1582     } else
1583 #endif
1584       mstack.push(conv, Matcher::Pre_Visit);
1585     return true;
1586   }
1587   return false;
1588 }
1589 
1590 // Should the Matcher clone shifts on addressing modes, expecting them
1591 // to be subsumed into complex addressing expressions or compute them
1592 // into registers?
1593 bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
1594   Node *off = m->in(AddPNode::Offset);
1595   if (off->is_Con()) {
1596     address_visited.test_set(m->_idx); // Flag as address_visited
1597     Node *adr = m->in(AddPNode::Address);
1598 
1599     // Intel can handle 2 adds in addressing mode
1600     // AtomicAdd is not an addressing expression.
1601     // Cheap to find it by looking for screwy base.
1602     if (adr->is_AddP() &&
1603         !adr->in(AddPNode::Base)->is_top() &&
1604         // Are there other uses besides address expressions?
1605         !is_visited(adr)) {
1606       address_visited.set(adr->_idx); // Flag as address_visited
1607       Node *shift = adr->in(AddPNode::Offset);
1608       if (!clone_shift(shift, this, mstack, address_visited)) {
1609         mstack.push(shift, Pre_Visit);
1610       }
1611       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
1612       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
1613     } else {
1614       mstack.push(adr, Pre_Visit);
1615     }
1616 
1617     // Clone X+offset as it also folds into most addressing expressions
1618     mstack.push(off, Visit);
1619     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1620     return true;
1621   } else if (clone_shift(off, this, mstack, address_visited)) {
1622     address_visited.test_set(m->_idx); // Flag as address_visited
1623     mstack.push(m->in(AddPNode::Address), Pre_Visit);
1624     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1625     return true;
1626   }
1627   return false;
1628 }
1629 
1630 void Compile::reshape_address(AddPNode* addp) {
1631 }
1632 
1633 // Helper methods for MachSpillCopyNode::implementation().
1634 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
1635                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
1636   // In 64-bit VM size calculation is very complex. Emitting instructions
1637   // into scratch buffer is used to get size in 64-bit VM.
1638   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1639   assert(ireg == Op_VecS || // 32bit vector
1640          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
1641          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
1642          "no non-adjacent vector moves" );
1643   if (cbuf) {
1644     MacroAssembler _masm(cbuf);
1645     int offset = __ offset();
1646     switch (ireg) {
1647     case Op_VecS: // copy whole register
1648     case Op_VecD:
1649     case Op_VecX:
1650       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1651       break;
1652     case Op_VecY:
1653       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1654       break;
1655     case Op_VecZ:
1656       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
1657       break;
1658     default:
1659       ShouldNotReachHere();
1660     }
1661     int size = __ offset() - offset;
1662 #ifdef ASSERT
1663     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
1664     assert(!do_size || size == 4, "incorrect size calculattion");
1665 #endif
1666     return size;
1667 #ifndef PRODUCT
1668   } else if (!do_size) {
1669     switch (ireg) {
1670     case Op_VecS:
1671     case Op_VecD:
1672     case Op_VecX:
1673       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1674       break;
1675     case Op_VecY:
1676     case Op_VecZ:
1677       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1678       break;
1679     default:
1680       ShouldNotReachHere();
1681     }
1682 #endif
1683   }
1684   // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
1685   return (UseAVX > 2) ? 6 : 4;
1686 }
1687 
1688 static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
1689                             int stack_offset, int reg, uint ireg, outputStream* st) {
1690   // In 64-bit VM size calculation is very complex. Emitting instructions
1691   // into scratch buffer is used to get size in 64-bit VM.
1692   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1693   if (cbuf) {
1694     MacroAssembler _masm(cbuf);
1695     int offset = __ offset();
1696     if (is_load) {
1697       switch (ireg) {
1698       case Op_VecS:
1699         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1700         break;
1701       case Op_VecD:
1702         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1703         break;
1704       case Op_VecX:
1705         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1706         break;
1707       case Op_VecY:
1708         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1709         break;
1710       case Op_VecZ:
1711         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
1712         break;
1713       default:
1714         ShouldNotReachHere();
1715       }
1716     } else { // store
1717       switch (ireg) {
1718       case Op_VecS:
1719         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1720         break;
1721       case Op_VecD:
1722         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1723         break;
1724       case Op_VecX:
1725         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1726         break;
1727       case Op_VecY:
1728         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1729         break;
1730       case Op_VecZ:
1731         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1732         break;
1733       default:
1734         ShouldNotReachHere();
1735       }
1736     }
1737     int size = __ offset() - offset;
1738 #ifdef ASSERT
1739     int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
1740     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
1741     assert(!do_size || size == (5+offset_size), "incorrect size calculattion");
1742 #endif
1743     return size;
1744 #ifndef PRODUCT
1745   } else if (!do_size) {
1746     if (is_load) {
1747       switch (ireg) {
1748       case Op_VecS:
1749         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1750         break;
1751       case Op_VecD:
1752         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1753         break;
1754        case Op_VecX:
1755         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1756         break;
1757       case Op_VecY:
1758       case Op_VecZ:
1759         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1760         break;
1761       default:
1762         ShouldNotReachHere();
1763       }
1764     } else { // store
1765       switch (ireg) {
1766       case Op_VecS:
1767         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1768         break;
1769       case Op_VecD:
1770         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1771         break;
1772        case Op_VecX:
1773         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1774         break;
1775       case Op_VecY:
1776       case Op_VecZ:
1777         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1778         break;
1779       default:
1780         ShouldNotReachHere();
1781       }
1782     }
1783 #endif
1784   }
1785   bool is_single_byte = false;
1786   int vec_len = 0;
1787   if ((UseAVX > 2) && (stack_offset != 0)) {
1788     int tuple_type = Assembler::EVEX_FVM;
1789     int input_size = Assembler::EVEX_32bit;
1790     switch (ireg) {
1791     case Op_VecS:
1792       tuple_type = Assembler::EVEX_T1S;
1793       break;
1794     case Op_VecD:
1795       tuple_type = Assembler::EVEX_T1S;
1796       input_size = Assembler::EVEX_64bit;
1797       break;
1798     case Op_VecX:
1799       break;
1800     case Op_VecY:
1801       vec_len = 1;
1802       break;
1803     case Op_VecZ:
1804       vec_len = 2;
1805       break;
1806     }
1807     is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, tuple_type, input_size, 0);
1808   }
1809   int offset_size = 0;
1810   int size = 5;
1811   if (UseAVX > 2 ) {
1812     if (VM_Version::supports_avx512novl() && (vec_len == 2)) {
1813       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
1814       size += 2; // Need an additional two bytes for EVEX encoding
1815     } else if (VM_Version::supports_avx512novl() && (vec_len < 2)) {
1816       offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1817     } else {
1818       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
1819       size += 2; // Need an additional two bytes for EVEX encodding
1820     }
1821   } else {
1822     offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1823   }
1824   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
1825   return size+offset_size;
1826 }
1827 
1828 static inline jint replicate4_imm(int con, int width) {
1829   // Load a constant of "width" (in bytes) and replicate it to fill 32bit.
1830   assert(width == 1 || width == 2, "only byte or short types here");
1831   int bit_width = width * 8;
1832   jint val = con;
1833   val &= (1 << bit_width) - 1;  // mask off sign bits
1834   while(bit_width < 32) {
1835     val |= (val << bit_width);
1836     bit_width <<= 1;
1837   }
1838   return val;
1839 }
1840 
1841 static inline jlong replicate8_imm(int con, int width) {
1842   // Load a constant of "width" (in bytes) and replicate it to fill 64bit.
1843   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
1844   int bit_width = width * 8;
1845   jlong val = con;
1846   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
1847   while(bit_width < 64) {
1848     val |= (val << bit_width);
1849     bit_width <<= 1;
1850   }
1851   return val;
1852 }
1853 
1854 #ifndef PRODUCT
1855   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
1856     st->print("nop \t# %d bytes pad for loops and calls", _count);
1857   }
1858 #endif
1859 
1860   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
1861     MacroAssembler _masm(&cbuf);
1862     __ nop(_count);
1863   }
1864 
1865   uint MachNopNode::size(PhaseRegAlloc*) const {
1866     return _count;
1867   }
1868 
1869 #ifndef PRODUCT
1870   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
1871     st->print("# breakpoint");
1872   }
1873 #endif
1874 
1875   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
1876     MacroAssembler _masm(&cbuf);
1877     __ int3();
1878   }
1879 
1880   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
1881     return MachNode::size(ra_);
1882   }
1883 
1884 %}
1885 
1886 encode %{
1887 
1888   enc_class call_epilog %{
1889     if (VerifyStackAtCalls) {
1890       // Check that stack depth is unchanged: find majik cookie on stack
1891       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
1892       MacroAssembler _masm(&cbuf);
1893       Label L;
1894       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
1895       __ jccb(Assembler::equal, L);
1896       // Die if stack mismatch
1897       __ int3();
1898       __ bind(L);
1899     }
1900   %}
1901 
1902 %}
1903 
1904 
1905 //----------OPERANDS-----------------------------------------------------------
1906 // Operand definitions must precede instruction definitions for correct parsing
1907 // in the ADLC because operands constitute user defined types which are used in
1908 // instruction definitions.
1909 
1910 // This one generically applies only for evex, so only one version
1911 operand vecZ() %{
1912   constraint(ALLOC_IN_RC(vectorz_reg));
1913   match(VecZ);
1914 
1915   format %{ %}
1916   interface(REG_INTER);
1917 %}
1918 
1919 // Comparison Code for FP conditional move
1920 operand cmpOp_vcmppd() %{
1921   match(Bool);
1922 
1923   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
1924             n->as_Bool()->_test._test != BoolTest::no_overflow);
1925   format %{ "" %}
1926   interface(COND_INTER) %{
1927     equal        (0x0, "eq");
1928     less         (0x1, "lt");
1929     less_equal   (0x2, "le");
1930     not_equal    (0xC, "ne");
1931     greater_equal(0xD, "ge");
1932     greater      (0xE, "gt");
1933     //TODO cannot compile (adlc breaks) without two next lines with error:
1934     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
1935     // equal' for overflow.
1936     overflow     (0x20, "o");  // not really supported by the instruction
1937     no_overflow  (0x21, "no"); // not really supported by the instruction
1938   %}
1939 %}
1940 
1941 
1942 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
1943 
1944 // ============================================================================
1945 
1946 instruct ShouldNotReachHere() %{
1947   match(Halt);
1948   format %{ "ud2\t# ShouldNotReachHere" %}
1949   ins_encode %{
1950     __ ud2();
1951   %}
1952   ins_pipe(pipe_slow);
1953 %}
1954 
1955 // =================================EVEX special===============================
1956 
1957 instruct setMask(rRegI dst, rRegI src) %{
1958   predicate(Matcher::has_predicated_vectors());
1959   match(Set dst (SetVectMaskI  src));
1960   effect(TEMP dst);
1961   format %{ "setvectmask   $dst, $src" %}
1962   ins_encode %{
1963     __ setvectmask($dst$$Register, $src$$Register);
1964   %}
1965   ins_pipe(pipe_slow);
1966 %}
1967 
1968 // ============================================================================
1969 
1970 instruct addF_reg(regF dst, regF src) %{
1971   predicate((UseSSE>=1) && (UseAVX == 0));
1972   match(Set dst (AddF dst src));
1973 
1974   format %{ "addss   $dst, $src" %}
1975   ins_cost(150);
1976   ins_encode %{
1977     __ addss($dst$$XMMRegister, $src$$XMMRegister);
1978   %}
1979   ins_pipe(pipe_slow);
1980 %}
1981 
1982 instruct addF_mem(regF dst, memory src) %{
1983   predicate((UseSSE>=1) && (UseAVX == 0));
1984   match(Set dst (AddF dst (LoadF src)));
1985 
1986   format %{ "addss   $dst, $src" %}
1987   ins_cost(150);
1988   ins_encode %{
1989     __ addss($dst$$XMMRegister, $src$$Address);
1990   %}
1991   ins_pipe(pipe_slow);
1992 %}
1993 
1994 instruct addF_imm(regF dst, immF con) %{
1995   predicate((UseSSE>=1) && (UseAVX == 0));
1996   match(Set dst (AddF dst con));
1997   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1998   ins_cost(150);
1999   ins_encode %{
2000     __ addss($dst$$XMMRegister, $constantaddress($con));
2001   %}
2002   ins_pipe(pipe_slow);
2003 %}
2004 
2005 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2006   predicate(UseAVX > 0);
2007   match(Set dst (AddF src1 src2));
2008 
2009   format %{ "vaddss  $dst, $src1, $src2" %}
2010   ins_cost(150);
2011   ins_encode %{
2012     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2013   %}
2014   ins_pipe(pipe_slow);
2015 %}
2016 
2017 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2018   predicate(UseAVX > 0);
2019   match(Set dst (AddF src1 (LoadF src2)));
2020 
2021   format %{ "vaddss  $dst, $src1, $src2" %}
2022   ins_cost(150);
2023   ins_encode %{
2024     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2025   %}
2026   ins_pipe(pipe_slow);
2027 %}
2028 
2029 instruct addF_reg_imm(regF dst, regF src, immF con) %{
2030   predicate(UseAVX > 0);
2031   match(Set dst (AddF src con));
2032 
2033   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2034   ins_cost(150);
2035   ins_encode %{
2036     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2037   %}
2038   ins_pipe(pipe_slow);
2039 %}
2040 
2041 instruct addD_reg(regD dst, regD src) %{
2042   predicate((UseSSE>=2) && (UseAVX == 0));
2043   match(Set dst (AddD dst src));
2044 
2045   format %{ "addsd   $dst, $src" %}
2046   ins_cost(150);
2047   ins_encode %{
2048     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
2049   %}
2050   ins_pipe(pipe_slow);
2051 %}
2052 
2053 instruct addD_mem(regD dst, memory src) %{
2054   predicate((UseSSE>=2) && (UseAVX == 0));
2055   match(Set dst (AddD dst (LoadD src)));
2056 
2057   format %{ "addsd   $dst, $src" %}
2058   ins_cost(150);
2059   ins_encode %{
2060     __ addsd($dst$$XMMRegister, $src$$Address);
2061   %}
2062   ins_pipe(pipe_slow);
2063 %}
2064 
2065 instruct addD_imm(regD dst, immD con) %{
2066   predicate((UseSSE>=2) && (UseAVX == 0));
2067   match(Set dst (AddD dst con));
2068   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2069   ins_cost(150);
2070   ins_encode %{
2071     __ addsd($dst$$XMMRegister, $constantaddress($con));
2072   %}
2073   ins_pipe(pipe_slow);
2074 %}
2075 
2076 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2077   predicate(UseAVX > 0);
2078   match(Set dst (AddD src1 src2));
2079 
2080   format %{ "vaddsd  $dst, $src1, $src2" %}
2081   ins_cost(150);
2082   ins_encode %{
2083     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2084   %}
2085   ins_pipe(pipe_slow);
2086 %}
2087 
2088 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
2089   predicate(UseAVX > 0);
2090   match(Set dst (AddD src1 (LoadD src2)));
2091 
2092   format %{ "vaddsd  $dst, $src1, $src2" %}
2093   ins_cost(150);
2094   ins_encode %{
2095     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2096   %}
2097   ins_pipe(pipe_slow);
2098 %}
2099 
2100 instruct addD_reg_imm(regD dst, regD src, immD con) %{
2101   predicate(UseAVX > 0);
2102   match(Set dst (AddD src con));
2103 
2104   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2105   ins_cost(150);
2106   ins_encode %{
2107     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2108   %}
2109   ins_pipe(pipe_slow);
2110 %}
2111 
2112 instruct subF_reg(regF dst, regF src) %{
2113   predicate((UseSSE>=1) && (UseAVX == 0));
2114   match(Set dst (SubF dst src));
2115 
2116   format %{ "subss   $dst, $src" %}
2117   ins_cost(150);
2118   ins_encode %{
2119     __ subss($dst$$XMMRegister, $src$$XMMRegister);
2120   %}
2121   ins_pipe(pipe_slow);
2122 %}
2123 
2124 instruct subF_mem(regF dst, memory src) %{
2125   predicate((UseSSE>=1) && (UseAVX == 0));
2126   match(Set dst (SubF dst (LoadF src)));
2127 
2128   format %{ "subss   $dst, $src" %}
2129   ins_cost(150);
2130   ins_encode %{
2131     __ subss($dst$$XMMRegister, $src$$Address);
2132   %}
2133   ins_pipe(pipe_slow);
2134 %}
2135 
2136 instruct subF_imm(regF dst, immF con) %{
2137   predicate((UseSSE>=1) && (UseAVX == 0));
2138   match(Set dst (SubF dst con));
2139   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2140   ins_cost(150);
2141   ins_encode %{
2142     __ subss($dst$$XMMRegister, $constantaddress($con));
2143   %}
2144   ins_pipe(pipe_slow);
2145 %}
2146 
2147 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2148   predicate(UseAVX > 0);
2149   match(Set dst (SubF src1 src2));
2150 
2151   format %{ "vsubss  $dst, $src1, $src2" %}
2152   ins_cost(150);
2153   ins_encode %{
2154     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2155   %}
2156   ins_pipe(pipe_slow);
2157 %}
2158 
2159 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2160   predicate(UseAVX > 0);
2161   match(Set dst (SubF src1 (LoadF src2)));
2162 
2163   format %{ "vsubss  $dst, $src1, $src2" %}
2164   ins_cost(150);
2165   ins_encode %{
2166     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2167   %}
2168   ins_pipe(pipe_slow);
2169 %}
2170 
2171 instruct subF_reg_imm(regF dst, regF src, immF con) %{
2172   predicate(UseAVX > 0);
2173   match(Set dst (SubF src con));
2174 
2175   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2176   ins_cost(150);
2177   ins_encode %{
2178     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2179   %}
2180   ins_pipe(pipe_slow);
2181 %}
2182 
2183 instruct subD_reg(regD dst, regD src) %{
2184   predicate((UseSSE>=2) && (UseAVX == 0));
2185   match(Set dst (SubD dst src));
2186 
2187   format %{ "subsd   $dst, $src" %}
2188   ins_cost(150);
2189   ins_encode %{
2190     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
2191   %}
2192   ins_pipe(pipe_slow);
2193 %}
2194 
2195 instruct subD_mem(regD dst, memory src) %{
2196   predicate((UseSSE>=2) && (UseAVX == 0));
2197   match(Set dst (SubD dst (LoadD src)));
2198 
2199   format %{ "subsd   $dst, $src" %}
2200   ins_cost(150);
2201   ins_encode %{
2202     __ subsd($dst$$XMMRegister, $src$$Address);
2203   %}
2204   ins_pipe(pipe_slow);
2205 %}
2206 
2207 instruct subD_imm(regD dst, immD con) %{
2208   predicate((UseSSE>=2) && (UseAVX == 0));
2209   match(Set dst (SubD dst con));
2210   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2211   ins_cost(150);
2212   ins_encode %{
2213     __ subsd($dst$$XMMRegister, $constantaddress($con));
2214   %}
2215   ins_pipe(pipe_slow);
2216 %}
2217 
2218 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
2219   predicate(UseAVX > 0);
2220   match(Set dst (SubD src1 src2));
2221 
2222   format %{ "vsubsd  $dst, $src1, $src2" %}
2223   ins_cost(150);
2224   ins_encode %{
2225     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2226   %}
2227   ins_pipe(pipe_slow);
2228 %}
2229 
2230 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
2231   predicate(UseAVX > 0);
2232   match(Set dst (SubD src1 (LoadD src2)));
2233 
2234   format %{ "vsubsd  $dst, $src1, $src2" %}
2235   ins_cost(150);
2236   ins_encode %{
2237     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2238   %}
2239   ins_pipe(pipe_slow);
2240 %}
2241 
2242 instruct subD_reg_imm(regD dst, regD src, immD con) %{
2243   predicate(UseAVX > 0);
2244   match(Set dst (SubD src con));
2245 
2246   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2247   ins_cost(150);
2248   ins_encode %{
2249     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2250   %}
2251   ins_pipe(pipe_slow);
2252 %}
2253 
2254 instruct mulF_reg(regF dst, regF src) %{
2255   predicate((UseSSE>=1) && (UseAVX == 0));
2256   match(Set dst (MulF dst src));
2257 
2258   format %{ "mulss   $dst, $src" %}
2259   ins_cost(150);
2260   ins_encode %{
2261     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
2262   %}
2263   ins_pipe(pipe_slow);
2264 %}
2265 
2266 instruct mulF_mem(regF dst, memory src) %{
2267   predicate((UseSSE>=1) && (UseAVX == 0));
2268   match(Set dst (MulF dst (LoadF src)));
2269 
2270   format %{ "mulss   $dst, $src" %}
2271   ins_cost(150);
2272   ins_encode %{
2273     __ mulss($dst$$XMMRegister, $src$$Address);
2274   %}
2275   ins_pipe(pipe_slow);
2276 %}
2277 
2278 instruct mulF_imm(regF dst, immF con) %{
2279   predicate((UseSSE>=1) && (UseAVX == 0));
2280   match(Set dst (MulF dst con));
2281   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2282   ins_cost(150);
2283   ins_encode %{
2284     __ mulss($dst$$XMMRegister, $constantaddress($con));
2285   %}
2286   ins_pipe(pipe_slow);
2287 %}
2288 
2289 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
2290   predicate(UseAVX > 0);
2291   match(Set dst (MulF src1 src2));
2292 
2293   format %{ "vmulss  $dst, $src1, $src2" %}
2294   ins_cost(150);
2295   ins_encode %{
2296     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2297   %}
2298   ins_pipe(pipe_slow);
2299 %}
2300 
2301 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
2302   predicate(UseAVX > 0);
2303   match(Set dst (MulF src1 (LoadF src2)));
2304 
2305   format %{ "vmulss  $dst, $src1, $src2" %}
2306   ins_cost(150);
2307   ins_encode %{
2308     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2309   %}
2310   ins_pipe(pipe_slow);
2311 %}
2312 
2313 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
2314   predicate(UseAVX > 0);
2315   match(Set dst (MulF src con));
2316 
2317   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2318   ins_cost(150);
2319   ins_encode %{
2320     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2321   %}
2322   ins_pipe(pipe_slow);
2323 %}
2324 
2325 instruct mulD_reg(regD dst, regD src) %{
2326   predicate((UseSSE>=2) && (UseAVX == 0));
2327   match(Set dst (MulD dst src));
2328 
2329   format %{ "mulsd   $dst, $src" %}
2330   ins_cost(150);
2331   ins_encode %{
2332     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
2333   %}
2334   ins_pipe(pipe_slow);
2335 %}
2336 
2337 instruct mulD_mem(regD dst, memory src) %{
2338   predicate((UseSSE>=2) && (UseAVX == 0));
2339   match(Set dst (MulD dst (LoadD src)));
2340 
2341   format %{ "mulsd   $dst, $src" %}
2342   ins_cost(150);
2343   ins_encode %{
2344     __ mulsd($dst$$XMMRegister, $src$$Address);
2345   %}
2346   ins_pipe(pipe_slow);
2347 %}
2348 
2349 instruct mulD_imm(regD dst, immD con) %{
2350   predicate((UseSSE>=2) && (UseAVX == 0));
2351   match(Set dst (MulD dst con));
2352   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2353   ins_cost(150);
2354   ins_encode %{
2355     __ mulsd($dst$$XMMRegister, $constantaddress($con));
2356   %}
2357   ins_pipe(pipe_slow);
2358 %}
2359 
2360 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
2361   predicate(UseAVX > 0);
2362   match(Set dst (MulD src1 src2));
2363 
2364   format %{ "vmulsd  $dst, $src1, $src2" %}
2365   ins_cost(150);
2366   ins_encode %{
2367     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2368   %}
2369   ins_pipe(pipe_slow);
2370 %}
2371 
2372 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
2373   predicate(UseAVX > 0);
2374   match(Set dst (MulD src1 (LoadD src2)));
2375 
2376   format %{ "vmulsd  $dst, $src1, $src2" %}
2377   ins_cost(150);
2378   ins_encode %{
2379     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2380   %}
2381   ins_pipe(pipe_slow);
2382 %}
2383 
2384 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
2385   predicate(UseAVX > 0);
2386   match(Set dst (MulD src con));
2387 
2388   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2389   ins_cost(150);
2390   ins_encode %{
2391     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2392   %}
2393   ins_pipe(pipe_slow);
2394 %}
2395 
2396 instruct divF_reg(regF dst, regF src) %{
2397   predicate((UseSSE>=1) && (UseAVX == 0));
2398   match(Set dst (DivF dst src));
2399 
2400   format %{ "divss   $dst, $src" %}
2401   ins_cost(150);
2402   ins_encode %{
2403     __ divss($dst$$XMMRegister, $src$$XMMRegister);
2404   %}
2405   ins_pipe(pipe_slow);
2406 %}
2407 
2408 instruct divF_mem(regF dst, memory src) %{
2409   predicate((UseSSE>=1) && (UseAVX == 0));
2410   match(Set dst (DivF dst (LoadF src)));
2411 
2412   format %{ "divss   $dst, $src" %}
2413   ins_cost(150);
2414   ins_encode %{
2415     __ divss($dst$$XMMRegister, $src$$Address);
2416   %}
2417   ins_pipe(pipe_slow);
2418 %}
2419 
2420 instruct divF_imm(regF dst, immF con) %{
2421   predicate((UseSSE>=1) && (UseAVX == 0));
2422   match(Set dst (DivF dst con));
2423   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2424   ins_cost(150);
2425   ins_encode %{
2426     __ divss($dst$$XMMRegister, $constantaddress($con));
2427   %}
2428   ins_pipe(pipe_slow);
2429 %}
2430 
2431 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
2432   predicate(UseAVX > 0);
2433   match(Set dst (DivF src1 src2));
2434 
2435   format %{ "vdivss  $dst, $src1, $src2" %}
2436   ins_cost(150);
2437   ins_encode %{
2438     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2439   %}
2440   ins_pipe(pipe_slow);
2441 %}
2442 
2443 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
2444   predicate(UseAVX > 0);
2445   match(Set dst (DivF src1 (LoadF src2)));
2446 
2447   format %{ "vdivss  $dst, $src1, $src2" %}
2448   ins_cost(150);
2449   ins_encode %{
2450     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2451   %}
2452   ins_pipe(pipe_slow);
2453 %}
2454 
2455 instruct divF_reg_imm(regF dst, regF src, immF con) %{
2456   predicate(UseAVX > 0);
2457   match(Set dst (DivF src con));
2458 
2459   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2460   ins_cost(150);
2461   ins_encode %{
2462     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2463   %}
2464   ins_pipe(pipe_slow);
2465 %}
2466 
2467 instruct divD_reg(regD dst, regD src) %{
2468   predicate((UseSSE>=2) && (UseAVX == 0));
2469   match(Set dst (DivD dst src));
2470 
2471   format %{ "divsd   $dst, $src" %}
2472   ins_cost(150);
2473   ins_encode %{
2474     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
2475   %}
2476   ins_pipe(pipe_slow);
2477 %}
2478 
2479 instruct divD_mem(regD dst, memory src) %{
2480   predicate((UseSSE>=2) && (UseAVX == 0));
2481   match(Set dst (DivD dst (LoadD src)));
2482 
2483   format %{ "divsd   $dst, $src" %}
2484   ins_cost(150);
2485   ins_encode %{
2486     __ divsd($dst$$XMMRegister, $src$$Address);
2487   %}
2488   ins_pipe(pipe_slow);
2489 %}
2490 
2491 instruct divD_imm(regD dst, immD con) %{
2492   predicate((UseSSE>=2) && (UseAVX == 0));
2493   match(Set dst (DivD dst con));
2494   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2495   ins_cost(150);
2496   ins_encode %{
2497     __ divsd($dst$$XMMRegister, $constantaddress($con));
2498   %}
2499   ins_pipe(pipe_slow);
2500 %}
2501 
2502 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
2503   predicate(UseAVX > 0);
2504   match(Set dst (DivD src1 src2));
2505 
2506   format %{ "vdivsd  $dst, $src1, $src2" %}
2507   ins_cost(150);
2508   ins_encode %{
2509     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2510   %}
2511   ins_pipe(pipe_slow);
2512 %}
2513 
2514 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
2515   predicate(UseAVX > 0);
2516   match(Set dst (DivD src1 (LoadD src2)));
2517 
2518   format %{ "vdivsd  $dst, $src1, $src2" %}
2519   ins_cost(150);
2520   ins_encode %{
2521     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2522   %}
2523   ins_pipe(pipe_slow);
2524 %}
2525 
2526 instruct divD_reg_imm(regD dst, regD src, immD con) %{
2527   predicate(UseAVX > 0);
2528   match(Set dst (DivD src con));
2529 
2530   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2531   ins_cost(150);
2532   ins_encode %{
2533     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2534   %}
2535   ins_pipe(pipe_slow);
2536 %}
2537 
2538 instruct absF_reg(regF dst) %{
2539   predicate((UseSSE>=1) && (UseAVX == 0));
2540   match(Set dst (AbsF dst));
2541   ins_cost(150);
2542   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
2543   ins_encode %{
2544     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
2545   %}
2546   ins_pipe(pipe_slow);
2547 %}
2548 
2549 instruct absF_reg_reg(regF dst, regF src) %{
2550   predicate(VM_Version::supports_avxonly());
2551   match(Set dst (AbsF src));
2552   ins_cost(150);
2553   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2554   ins_encode %{
2555     int vector_len = 0;
2556     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2557               ExternalAddress(float_signmask()), vector_len);
2558   %}
2559   ins_pipe(pipe_slow);
2560 %}
2561 
2562 #ifdef _LP64
2563 instruct absF_reg_reg_evex(regF dst, regF src) %{
2564   predicate(UseAVX > 2 && VM_Version::supports_avx512vl());
2565   match(Set dst (AbsF src));
2566   ins_cost(150);
2567   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2568   ins_encode %{
2569     int vector_len = 0;
2570     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2571               ExternalAddress(float_signmask()), vector_len);
2572   %}
2573   ins_pipe(pipe_slow);
2574 %}
2575 
2576 instruct absF_reg_reg_evex_special(regF dst, regF src1, regF src2) %{
2577   predicate(VM_Version::supports_avx512novl());
2578   match(Set dst (AbsF src1));
2579   effect(TEMP src2);
2580   ins_cost(150);
2581   format %{ "vabsss  $dst, $src1, $src2, [0x7fffffff]\t# abs float by sign masking" %}
2582   ins_encode %{
2583     int vector_len = 0;
2584     __ vabsss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
2585               ExternalAddress(float_signmask()), vector_len);
2586   %}
2587   ins_pipe(pipe_slow);
2588 %}
2589 #else // _LP64
2590 instruct absF_reg_reg_evex(regF dst, regF src) %{
2591   predicate(UseAVX > 2);
2592   match(Set dst (AbsF src));
2593   ins_cost(150);
2594   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2595   ins_encode %{
2596     int vector_len = 0;
2597     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2598               ExternalAddress(float_signmask()), vector_len);
2599   %}
2600   ins_pipe(pipe_slow);
2601 %}
2602 #endif
2603 
2604 instruct absD_reg(regD dst) %{
2605   predicate((UseSSE>=2) && (UseAVX == 0));
2606   match(Set dst (AbsD dst));
2607   ins_cost(150);
2608   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
2609             "# abs double by sign masking" %}
2610   ins_encode %{
2611     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
2612   %}
2613   ins_pipe(pipe_slow);
2614 %}
2615 
2616 instruct absD_reg_reg(regD dst, regD src) %{
2617   predicate(VM_Version::supports_avxonly());
2618   match(Set dst (AbsD src));
2619   ins_cost(150);
2620   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2621             "# abs double by sign masking" %}
2622   ins_encode %{
2623     int vector_len = 0;
2624     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2625               ExternalAddress(double_signmask()), vector_len);
2626   %}
2627   ins_pipe(pipe_slow);
2628 %}
2629 
2630 #ifdef _LP64
2631 instruct absD_reg_reg_evex(regD dst, regD src) %{
2632   predicate(UseAVX > 2 && VM_Version::supports_avx512vl());
2633   match(Set dst (AbsD src));
2634   ins_cost(150);
2635   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2636             "# abs double by sign masking" %}
2637   ins_encode %{
2638     int vector_len = 0;
2639     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2640               ExternalAddress(double_signmask()), vector_len);
2641   %}
2642   ins_pipe(pipe_slow);
2643 %}
2644 
2645 instruct absD_reg_reg_evex_special(regD dst, regD src1, regD src2) %{
2646   predicate(VM_Version::supports_avx512novl());
2647   match(Set dst (AbsD src1));
2648   effect(TEMP src2);
2649   ins_cost(150);
2650   format %{ "vabssd  $dst, $src1, $src2, [0x7fffffffffffffff]\t# abs float by sign masking" %}
2651   ins_encode %{
2652     int vector_len = 0;
2653     __ vabssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister,
2654               ExternalAddress(double_signmask()), vector_len);
2655   %}
2656   ins_pipe(pipe_slow);
2657 %}
2658 #else // _LP64
2659 instruct absD_reg_reg_evex(regD dst, regD src) %{
2660   predicate(UseAVX > 2);
2661   match(Set dst (AbsD src));
2662   ins_cost(150);
2663   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2664             "# abs double by sign masking" %}
2665   ins_encode %{
2666     int vector_len = 0;
2667     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2668               ExternalAddress(double_signmask()), vector_len);
2669   %}
2670   ins_pipe(pipe_slow);
2671 %}
2672 #endif
2673 
2674 instruct negF_reg(regF dst) %{
2675   predicate((UseSSE>=1) && (UseAVX == 0));
2676   match(Set dst (NegF dst));
2677   ins_cost(150);
2678   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
2679   ins_encode %{
2680     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
2681   %}
2682   ins_pipe(pipe_slow);
2683 %}
2684 
2685 instruct negF_reg_reg(regF dst, regF src) %{
2686   predicate(UseAVX > 0);
2687   match(Set dst (NegF src));
2688   ins_cost(150);
2689   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
2690   ins_encode %{
2691     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
2692                  ExternalAddress(float_signflip()));
2693   %}
2694   ins_pipe(pipe_slow);
2695 %}
2696 
2697 instruct negD_reg(regD dst) %{
2698   predicate((UseSSE>=2) && (UseAVX == 0));
2699   match(Set dst (NegD dst));
2700   ins_cost(150);
2701   format %{ "xorpd   $dst, [0x8000000000000000]\t"
2702             "# neg double by sign flipping" %}
2703   ins_encode %{
2704     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
2705   %}
2706   ins_pipe(pipe_slow);
2707 %}
2708 
2709 instruct negD_reg_reg(regD dst, regD src) %{
2710   predicate(UseAVX > 0);
2711   match(Set dst (NegD src));
2712   ins_cost(150);
2713   format %{ "vnegatess  $dst, $src, [0x8000000000000000]\t"
2714             "# neg double by sign flipping" %}
2715   ins_encode %{
2716     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
2717                  ExternalAddress(double_signflip()));
2718   %}
2719   ins_pipe(pipe_slow);
2720 %}
2721 
2722 instruct sqrtF_reg(regF dst, regF src) %{
2723   predicate(UseSSE>=1);
2724   match(Set dst (SqrtF src));
2725 
2726   format %{ "sqrtss  $dst, $src" %}
2727   ins_cost(150);
2728   ins_encode %{
2729     __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
2730   %}
2731   ins_pipe(pipe_slow);
2732 %}
2733 
2734 instruct sqrtF_mem(regF dst, memory src) %{
2735   predicate(UseSSE>=1);
2736   match(Set dst (SqrtF (LoadF src)));
2737 
2738   format %{ "sqrtss  $dst, $src" %}
2739   ins_cost(150);
2740   ins_encode %{
2741     __ sqrtss($dst$$XMMRegister, $src$$Address);
2742   %}
2743   ins_pipe(pipe_slow);
2744 %}
2745 
2746 instruct sqrtF_imm(regF dst, immF con) %{
2747   predicate(UseSSE>=1);
2748   match(Set dst (SqrtF con));
2749 
2750   format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2751   ins_cost(150);
2752   ins_encode %{
2753     __ sqrtss($dst$$XMMRegister, $constantaddress($con));
2754   %}
2755   ins_pipe(pipe_slow);
2756 %}
2757 
2758 instruct sqrtD_reg(regD dst, regD src) %{
2759   predicate(UseSSE>=2);
2760   match(Set dst (SqrtD src));
2761 
2762   format %{ "sqrtsd  $dst, $src" %}
2763   ins_cost(150);
2764   ins_encode %{
2765     __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
2766   %}
2767   ins_pipe(pipe_slow);
2768 %}
2769 
2770 instruct sqrtD_mem(regD dst, memory src) %{
2771   predicate(UseSSE>=2);
2772   match(Set dst (SqrtD (LoadD src)));
2773 
2774   format %{ "sqrtsd  $dst, $src" %}
2775   ins_cost(150);
2776   ins_encode %{
2777     __ sqrtsd($dst$$XMMRegister, $src$$Address);
2778   %}
2779   ins_pipe(pipe_slow);
2780 %}
2781 
2782 instruct sqrtD_imm(regD dst, immD con) %{
2783   predicate(UseSSE>=2);
2784   match(Set dst (SqrtD con));
2785   format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2786   ins_cost(150);
2787   ins_encode %{
2788     __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
2789   %}
2790   ins_pipe(pipe_slow);
2791 %}
2792 
2793 instruct onspinwait() %{
2794   match(OnSpinWait);
2795   ins_cost(200);
2796 
2797   format %{
2798     $$template
2799     if (os::is_MP()) {
2800       $$emit$$"pause\t! membar_onspinwait"
2801     } else {
2802       $$emit$$"MEMBAR-onspinwait ! (empty encoding)"
2803     }
2804   %}
2805   ins_encode %{
2806     __ pause();
2807   %}
2808   ins_pipe(pipe_slow);
2809 %}
2810 
2811 // a * b + c
2812 instruct fmaD_reg(regD a, regD b, regD c) %{
2813   predicate(UseFMA);
2814   match(Set c (FmaD  c (Binary a b)));
2815   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
2816   ins_cost(150);
2817   ins_encode %{
2818     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2819   %}
2820   ins_pipe( pipe_slow );
2821 %}
2822 
2823 // a * b + c
2824 instruct fmaF_reg(regF a, regF b, regF c) %{
2825   predicate(UseFMA);
2826   match(Set c (FmaF  c (Binary a b)));
2827   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
2828   ins_cost(150);
2829   ins_encode %{
2830     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2831   %}
2832   ins_pipe( pipe_slow );
2833 %}
2834 
2835 // ====================VECTOR INSTRUCTIONS=====================================
2836 
2837 // Load vectors (4 bytes long)
2838 instruct loadV4(vecS dst, memory mem) %{
2839   predicate(n->as_LoadVector()->memory_size() == 4);
2840   match(Set dst (LoadVector mem));
2841   ins_cost(125);
2842   format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
2843   ins_encode %{
2844     __ movdl($dst$$XMMRegister, $mem$$Address);
2845   %}
2846   ins_pipe( pipe_slow );
2847 %}
2848 
2849 // Load vectors (8 bytes long)
2850 instruct loadV8(vecD dst, memory mem) %{
2851   predicate(n->as_LoadVector()->memory_size() == 8);
2852   match(Set dst (LoadVector mem));
2853   ins_cost(125);
2854   format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
2855   ins_encode %{
2856     __ movq($dst$$XMMRegister, $mem$$Address);
2857   %}
2858   ins_pipe( pipe_slow );
2859 %}
2860 
2861 // Load vectors (16 bytes long)
2862 instruct loadV16(vecX dst, memory mem) %{
2863   predicate(n->as_LoadVector()->memory_size() == 16);
2864   match(Set dst (LoadVector mem));
2865   ins_cost(125);
2866   format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
2867   ins_encode %{
2868     __ movdqu($dst$$XMMRegister, $mem$$Address);
2869   %}
2870   ins_pipe( pipe_slow );
2871 %}
2872 
2873 // Load vectors (32 bytes long)
2874 instruct loadV32(vecY dst, memory mem) %{
2875   predicate(n->as_LoadVector()->memory_size() == 32);
2876   match(Set dst (LoadVector mem));
2877   ins_cost(125);
2878   format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
2879   ins_encode %{
2880     __ vmovdqu($dst$$XMMRegister, $mem$$Address);
2881   %}
2882   ins_pipe( pipe_slow );
2883 %}
2884 
2885 // Load vectors (64 bytes long)
2886 instruct loadV64_dword(vecZ dst, memory mem) %{
2887   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() <= 4);
2888   match(Set dst (LoadVector mem));
2889   ins_cost(125);
2890   format %{ "vmovdqul $dst k0,$mem\t! load vector (64 bytes)" %}
2891   ins_encode %{
2892     int vector_len = 2;
2893     __ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
2894   %}
2895   ins_pipe( pipe_slow );
2896 %}
2897 
2898 // Load vectors (64 bytes long)
2899 instruct loadV64_qword(vecZ dst, memory mem) %{
2900   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() > 4);
2901   match(Set dst (LoadVector mem));
2902   ins_cost(125);
2903   format %{ "vmovdquq $dst k0,$mem\t! load vector (64 bytes)" %}
2904   ins_encode %{
2905     int vector_len = 2;
2906     __ evmovdquq($dst$$XMMRegister, $mem$$Address, vector_len);
2907   %}
2908   ins_pipe( pipe_slow );
2909 %}
2910 
2911 // Store vectors
2912 instruct storeV4(memory mem, vecS src) %{
2913   predicate(n->as_StoreVector()->memory_size() == 4);
2914   match(Set mem (StoreVector mem src));
2915   ins_cost(145);
2916   format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
2917   ins_encode %{
2918     __ movdl($mem$$Address, $src$$XMMRegister);
2919   %}
2920   ins_pipe( pipe_slow );
2921 %}
2922 
2923 instruct storeV8(memory mem, vecD src) %{
2924   predicate(n->as_StoreVector()->memory_size() == 8);
2925   match(Set mem (StoreVector mem src));
2926   ins_cost(145);
2927   format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
2928   ins_encode %{
2929     __ movq($mem$$Address, $src$$XMMRegister);
2930   %}
2931   ins_pipe( pipe_slow );
2932 %}
2933 
2934 instruct storeV16(memory mem, vecX src) %{
2935   predicate(n->as_StoreVector()->memory_size() == 16);
2936   match(Set mem (StoreVector mem src));
2937   ins_cost(145);
2938   format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
2939   ins_encode %{
2940     __ movdqu($mem$$Address, $src$$XMMRegister);
2941   %}
2942   ins_pipe( pipe_slow );
2943 %}
2944 
2945 instruct storeV32(memory mem, vecY src) %{
2946   predicate(n->as_StoreVector()->memory_size() == 32);
2947   match(Set mem (StoreVector mem src));
2948   ins_cost(145);
2949   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
2950   ins_encode %{
2951     __ vmovdqu($mem$$Address, $src$$XMMRegister);
2952   %}
2953   ins_pipe( pipe_slow );
2954 %}
2955 
2956 instruct storeV64_dword(memory mem, vecZ src) %{
2957   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() <= 4);
2958   match(Set mem (StoreVector mem src));
2959   ins_cost(145);
2960   format %{ "vmovdqul $mem k0,$src\t! store vector (64 bytes)" %}
2961   ins_encode %{
2962     int vector_len = 2;
2963     __ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
2964   %}
2965   ins_pipe( pipe_slow );
2966 %}
2967 
2968 instruct storeV64_qword(memory mem, vecZ src) %{
2969   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() > 4);
2970   match(Set mem (StoreVector mem src));
2971   ins_cost(145);
2972   format %{ "vmovdquq $mem k0,$src\t! store vector (64 bytes)" %}
2973   ins_encode %{
2974     int vector_len = 2;
2975     __ evmovdquq($mem$$Address, $src$$XMMRegister, vector_len);
2976   %}
2977   ins_pipe( pipe_slow );
2978 %}
2979 
2980 // ====================LEGACY REPLICATE=======================================
2981 
2982 instruct Repl4B_mem(vecS dst, memory mem) %{
2983   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2984   match(Set dst (ReplicateB (LoadB mem)));
2985   format %{ "punpcklbw $dst,$mem\n\t"
2986             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
2987   ins_encode %{
2988     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
2989     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2990   %}
2991   ins_pipe( pipe_slow );
2992 %}
2993 
2994 instruct Repl8B_mem(vecD dst, memory mem) %{
2995   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
2996   match(Set dst (ReplicateB (LoadB mem)));
2997   format %{ "punpcklbw $dst,$mem\n\t"
2998             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
2999   ins_encode %{
3000     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3001     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3002   %}
3003   ins_pipe( pipe_slow );
3004 %}
3005 
3006 instruct Repl16B(vecX dst, rRegI src) %{
3007   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3008   match(Set dst (ReplicateB src));
3009   format %{ "movd    $dst,$src\n\t"
3010             "punpcklbw $dst,$dst\n\t"
3011             "pshuflw $dst,$dst,0x00\n\t"
3012             "punpcklqdq $dst,$dst\t! replicate16B" %}
3013   ins_encode %{
3014     __ movdl($dst$$XMMRegister, $src$$Register);
3015     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3016     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3017     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3018   %}
3019   ins_pipe( pipe_slow );
3020 %}
3021 
3022 instruct Repl16B_mem(vecX dst, memory mem) %{
3023   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3024   match(Set dst (ReplicateB (LoadB mem)));
3025   format %{ "punpcklbw $dst,$mem\n\t"
3026             "pshuflw $dst,$dst,0x00\n\t"
3027             "punpcklqdq $dst,$dst\t! replicate16B" %}
3028   ins_encode %{
3029     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3030     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3031     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3032   %}
3033   ins_pipe( pipe_slow );
3034 %}
3035 
3036 instruct Repl32B(vecY dst, rRegI src) %{
3037   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3038   match(Set dst (ReplicateB src));
3039   format %{ "movd    $dst,$src\n\t"
3040             "punpcklbw $dst,$dst\n\t"
3041             "pshuflw $dst,$dst,0x00\n\t"
3042             "punpcklqdq $dst,$dst\n\t"
3043             "vinserti128_high $dst,$dst\t! replicate32B" %}
3044   ins_encode %{
3045     __ movdl($dst$$XMMRegister, $src$$Register);
3046     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3047     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3048     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3049     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3050   %}
3051   ins_pipe( pipe_slow );
3052 %}
3053 
3054 instruct Repl32B_mem(vecY dst, memory mem) %{
3055   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3056   match(Set dst (ReplicateB (LoadB mem)));
3057   format %{ "punpcklbw $dst,$mem\n\t"
3058             "pshuflw $dst,$dst,0x00\n\t"
3059             "punpcklqdq $dst,$dst\n\t"
3060             "vinserti128_high $dst,$dst\t! replicate32B" %}
3061   ins_encode %{
3062     __ punpcklbw($dst$$XMMRegister, $mem$$Address);
3063     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3064     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3065     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3066   %}
3067   ins_pipe( pipe_slow );
3068 %}
3069 
3070 instruct Repl16B_imm(vecX dst, immI con) %{
3071   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3072   match(Set dst (ReplicateB con));
3073   format %{ "movq    $dst,[$constantaddress]\n\t"
3074             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
3075   ins_encode %{
3076     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3077     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3078   %}
3079   ins_pipe( pipe_slow );
3080 %}
3081 
3082 instruct Repl32B_imm(vecY dst, immI con) %{
3083   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3084   match(Set dst (ReplicateB con));
3085   format %{ "movq    $dst,[$constantaddress]\n\t"
3086             "punpcklqdq $dst,$dst\n\t"
3087             "vinserti128_high $dst,$dst\t! lreplicate32B($con)" %}
3088   ins_encode %{
3089     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3090     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3091     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3092   %}
3093   ins_pipe( pipe_slow );
3094 %}
3095 
3096 instruct Repl4S(vecD dst, rRegI src) %{
3097   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
3098   match(Set dst (ReplicateS src));
3099   format %{ "movd    $dst,$src\n\t"
3100             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
3101   ins_encode %{
3102     __ movdl($dst$$XMMRegister, $src$$Register);
3103     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3104   %}
3105   ins_pipe( pipe_slow );
3106 %}
3107 
3108 instruct Repl4S_mem(vecD dst, memory mem) %{
3109   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3110   match(Set dst (ReplicateS (LoadS mem)));
3111   format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
3112   ins_encode %{
3113     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3114   %}
3115   ins_pipe( pipe_slow );
3116 %}
3117 
3118 instruct Repl8S(vecX dst, rRegI src) %{
3119   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3120   match(Set dst (ReplicateS src));
3121   format %{ "movd    $dst,$src\n\t"
3122             "pshuflw $dst,$dst,0x00\n\t"
3123             "punpcklqdq $dst,$dst\t! replicate8S" %}
3124   ins_encode %{
3125     __ movdl($dst$$XMMRegister, $src$$Register);
3126     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3127     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3128   %}
3129   ins_pipe( pipe_slow );
3130 %}
3131 
3132 instruct Repl8S_mem(vecX dst, memory mem) %{
3133   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3134   match(Set dst (ReplicateS (LoadS mem)));
3135   format %{ "pshuflw $dst,$mem,0x00\n\t"
3136             "punpcklqdq $dst,$dst\t! replicate8S" %}
3137   ins_encode %{
3138     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3139     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3140   %}
3141   ins_pipe( pipe_slow );
3142 %}
3143 
3144 instruct Repl8S_imm(vecX dst, immI con) %{
3145   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3146   match(Set dst (ReplicateS con));
3147   format %{ "movq    $dst,[$constantaddress]\n\t"
3148             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
3149   ins_encode %{
3150     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3151     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3152   %}
3153   ins_pipe( pipe_slow );
3154 %}
3155 
3156 instruct Repl16S(vecY dst, rRegI src) %{
3157   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3158   match(Set dst (ReplicateS src));
3159   format %{ "movd    $dst,$src\n\t"
3160             "pshuflw $dst,$dst,0x00\n\t"
3161             "punpcklqdq $dst,$dst\n\t"
3162             "vinserti128_high $dst,$dst\t! replicate16S" %}
3163   ins_encode %{
3164     __ movdl($dst$$XMMRegister, $src$$Register);
3165     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3166     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3167     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3168   %}
3169   ins_pipe( pipe_slow );
3170 %}
3171 
3172 instruct Repl16S_mem(vecY dst, memory mem) %{
3173   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3174   match(Set dst (ReplicateS (LoadS mem)));
3175   format %{ "pshuflw $dst,$mem,0x00\n\t"
3176             "punpcklqdq $dst,$dst\n\t"
3177             "vinserti128_high $dst,$dst\t! replicate16S" %}
3178   ins_encode %{
3179     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3180     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3181     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3182   %}
3183   ins_pipe( pipe_slow );
3184 %}
3185 
3186 instruct Repl16S_imm(vecY dst, immI con) %{
3187   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3188   match(Set dst (ReplicateS con));
3189   format %{ "movq    $dst,[$constantaddress]\n\t"
3190             "punpcklqdq $dst,$dst\n\t"
3191             "vinserti128_high $dst,$dst\t! replicate16S($con)" %}
3192   ins_encode %{
3193     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3194     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3195     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3196   %}
3197   ins_pipe( pipe_slow );
3198 %}
3199 
3200 instruct Repl4I(vecX dst, rRegI src) %{
3201   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3202   match(Set dst (ReplicateI src));
3203   format %{ "movd    $dst,$src\n\t"
3204             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
3205   ins_encode %{
3206     __ movdl($dst$$XMMRegister, $src$$Register);
3207     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3208   %}
3209   ins_pipe( pipe_slow );
3210 %}
3211 
3212 instruct Repl4I_mem(vecX dst, memory mem) %{
3213   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3214   match(Set dst (ReplicateI (LoadI mem)));
3215   format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
3216   ins_encode %{
3217     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3218   %}
3219   ins_pipe( pipe_slow );
3220 %}
3221 
3222 instruct Repl8I(vecY dst, rRegI src) %{
3223   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3224   match(Set dst (ReplicateI src));
3225   format %{ "movd    $dst,$src\n\t"
3226             "pshufd  $dst,$dst,0x00\n\t"
3227             "vinserti128_high $dst,$dst\t! replicate8I" %}
3228   ins_encode %{
3229     __ movdl($dst$$XMMRegister, $src$$Register);
3230     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3231     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3232   %}
3233   ins_pipe( pipe_slow );
3234 %}
3235 
3236 instruct Repl8I_mem(vecY dst, memory mem) %{
3237   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3238   match(Set dst (ReplicateI (LoadI mem)));
3239   format %{ "pshufd  $dst,$mem,0x00\n\t"
3240             "vinserti128_high $dst,$dst\t! replicate8I" %}
3241   ins_encode %{
3242     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3243     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3244   %}
3245   ins_pipe( pipe_slow );
3246 %}
3247 
3248 instruct Repl4I_imm(vecX dst, immI con) %{
3249   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3250   match(Set dst (ReplicateI con));
3251   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
3252             "punpcklqdq $dst,$dst" %}
3253   ins_encode %{
3254     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3255     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3256   %}
3257   ins_pipe( pipe_slow );
3258 %}
3259 
3260 instruct Repl8I_imm(vecY dst, immI con) %{
3261   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3262   match(Set dst (ReplicateI con));
3263   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
3264             "punpcklqdq $dst,$dst\n\t"
3265             "vinserti128_high $dst,$dst" %}
3266   ins_encode %{
3267     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3268     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3269     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3270   %}
3271   ins_pipe( pipe_slow );
3272 %}
3273 
3274 // Long could be loaded into xmm register directly from memory.
3275 instruct Repl2L_mem(vecX dst, memory mem) %{
3276   predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
3277   match(Set dst (ReplicateL (LoadL mem)));
3278   format %{ "movq    $dst,$mem\n\t"
3279             "punpcklqdq $dst,$dst\t! replicate2L" %}
3280   ins_encode %{
3281     __ movq($dst$$XMMRegister, $mem$$Address);
3282     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3283   %}
3284   ins_pipe( pipe_slow );
3285 %}
3286 
3287 // Replicate long (8 byte) scalar to be vector
3288 #ifdef _LP64
3289 instruct Repl4L(vecY dst, rRegL src) %{
3290   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3291   match(Set dst (ReplicateL src));
3292   format %{ "movdq   $dst,$src\n\t"
3293             "punpcklqdq $dst,$dst\n\t"
3294             "vinserti128_high $dst,$dst\t! replicate4L" %}
3295   ins_encode %{
3296     __ movdq($dst$$XMMRegister, $src$$Register);
3297     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3298     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3299   %}
3300   ins_pipe( pipe_slow );
3301 %}
3302 #else // _LP64
3303 instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
3304   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3305   match(Set dst (ReplicateL src));
3306   effect(TEMP dst, USE src, TEMP tmp);
3307   format %{ "movdl   $dst,$src.lo\n\t"
3308             "movdl   $tmp,$src.hi\n\t"
3309             "punpckldq $dst,$tmp\n\t"
3310             "punpcklqdq $dst,$dst\n\t"
3311             "vinserti128_high $dst,$dst\t! replicate4L" %}
3312   ins_encode %{
3313     __ movdl($dst$$XMMRegister, $src$$Register);
3314     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3315     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3316     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3317     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3318   %}
3319   ins_pipe( pipe_slow );
3320 %}
3321 #endif // _LP64
3322 
3323 instruct Repl4L_imm(vecY dst, immL con) %{
3324   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3325   match(Set dst (ReplicateL con));
3326   format %{ "movq    $dst,[$constantaddress]\n\t"
3327             "punpcklqdq $dst,$dst\n\t"
3328             "vinserti128_high $dst,$dst\t! replicate4L($con)" %}
3329   ins_encode %{
3330     __ movq($dst$$XMMRegister, $constantaddress($con));
3331     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3332     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3333   %}
3334   ins_pipe( pipe_slow );
3335 %}
3336 
3337 instruct Repl4L_mem(vecY dst, memory mem) %{
3338   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3339   match(Set dst (ReplicateL (LoadL mem)));
3340   format %{ "movq    $dst,$mem\n\t"
3341             "punpcklqdq $dst,$dst\n\t"
3342             "vinserti128_high $dst,$dst\t! replicate4L" %}
3343   ins_encode %{
3344     __ movq($dst$$XMMRegister, $mem$$Address);
3345     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3346     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3347   %}
3348   ins_pipe( pipe_slow );
3349 %}
3350 
3351 instruct Repl2F_mem(vecD dst, memory mem) %{
3352   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3353   match(Set dst (ReplicateF (LoadF mem)));
3354   format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
3355   ins_encode %{
3356     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3357   %}
3358   ins_pipe( pipe_slow );
3359 %}
3360 
3361 instruct Repl4F_mem(vecX dst, memory mem) %{
3362   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3363   match(Set dst (ReplicateF (LoadF mem)));
3364   format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
3365   ins_encode %{
3366     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3367   %}
3368   ins_pipe( pipe_slow );
3369 %}
3370 
3371 instruct Repl8F(vecY dst, regF src) %{
3372   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3373   match(Set dst (ReplicateF src));
3374   format %{ "pshufd  $dst,$src,0x00\n\t"
3375             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3376   ins_encode %{
3377     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3378     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3379   %}
3380   ins_pipe( pipe_slow );
3381 %}
3382 
3383 instruct Repl8F_mem(vecY dst, memory mem) %{
3384   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3385   match(Set dst (ReplicateF (LoadF mem)));
3386   format %{ "pshufd  $dst,$mem,0x00\n\t"
3387             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3388   ins_encode %{
3389     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3390     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3391   %}
3392   ins_pipe( pipe_slow );
3393 %}
3394 
3395 instruct Repl2F_zero(vecD dst, immF0 zero) %{
3396   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3397   match(Set dst (ReplicateF zero));
3398   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
3399   ins_encode %{
3400     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3401   %}
3402   ins_pipe( fpu_reg_reg );
3403 %}
3404 
3405 instruct Repl4F_zero(vecX dst, immF0 zero) %{
3406   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3407   match(Set dst (ReplicateF zero));
3408   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
3409   ins_encode %{
3410     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3411   %}
3412   ins_pipe( fpu_reg_reg );
3413 %}
3414 
3415 instruct Repl8F_zero(vecY dst, immF0 zero) %{
3416   predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
3417   match(Set dst (ReplicateF zero));
3418   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
3419   ins_encode %{
3420     int vector_len = 1;
3421     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3422   %}
3423   ins_pipe( fpu_reg_reg );
3424 %}
3425 
3426 instruct Repl2D_mem(vecX dst, memory mem) %{
3427   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3428   match(Set dst (ReplicateD (LoadD mem)));
3429   format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
3430   ins_encode %{
3431     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3432   %}
3433   ins_pipe( pipe_slow );
3434 %}
3435 
3436 instruct Repl4D(vecY dst, regD src) %{
3437   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3438   match(Set dst (ReplicateD src));
3439   format %{ "pshufd  $dst,$src,0x44\n\t"
3440             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3441   ins_encode %{
3442     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3443     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3444   %}
3445   ins_pipe( pipe_slow );
3446 %}
3447 
3448 instruct Repl4D_mem(vecY dst, memory mem) %{
3449   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3450   match(Set dst (ReplicateD (LoadD mem)));
3451   format %{ "pshufd  $dst,$mem,0x44\n\t"
3452             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3453   ins_encode %{
3454     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3455     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3456   %}
3457   ins_pipe( pipe_slow );
3458 %}
3459 
3460 // Replicate double (8 byte) scalar zero to be vector
3461 instruct Repl2D_zero(vecX dst, immD0 zero) %{
3462   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3463   match(Set dst (ReplicateD zero));
3464   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
3465   ins_encode %{
3466     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
3467   %}
3468   ins_pipe( fpu_reg_reg );
3469 %}
3470 
3471 instruct Repl4D_zero(vecY dst, immD0 zero) %{
3472   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3473   match(Set dst (ReplicateD zero));
3474   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
3475   ins_encode %{
3476     int vector_len = 1;
3477     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3478   %}
3479   ins_pipe( fpu_reg_reg );
3480 %}
3481 
3482 // ====================GENERIC REPLICATE==========================================
3483 
3484 // Replicate byte scalar to be vector
3485 instruct Repl4B(vecS dst, rRegI src) %{
3486   predicate(n->as_Vector()->length() == 4);
3487   match(Set dst (ReplicateB src));
3488   format %{ "movd    $dst,$src\n\t"
3489             "punpcklbw $dst,$dst\n\t"
3490             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
3491   ins_encode %{
3492     __ movdl($dst$$XMMRegister, $src$$Register);
3493     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3494     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3495   %}
3496   ins_pipe( pipe_slow );
3497 %}
3498 
3499 instruct Repl8B(vecD dst, rRegI src) %{
3500   predicate(n->as_Vector()->length() == 8);
3501   match(Set dst (ReplicateB src));
3502   format %{ "movd    $dst,$src\n\t"
3503             "punpcklbw $dst,$dst\n\t"
3504             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
3505   ins_encode %{
3506     __ movdl($dst$$XMMRegister, $src$$Register);
3507     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3508     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3509   %}
3510   ins_pipe( pipe_slow );
3511 %}
3512 
3513 // Replicate byte scalar immediate to be vector by loading from const table.
3514 instruct Repl4B_imm(vecS dst, immI con) %{
3515   predicate(n->as_Vector()->length() == 4);
3516   match(Set dst (ReplicateB con));
3517   format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
3518   ins_encode %{
3519     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
3520   %}
3521   ins_pipe( pipe_slow );
3522 %}
3523 
3524 instruct Repl8B_imm(vecD dst, immI con) %{
3525   predicate(n->as_Vector()->length() == 8);
3526   match(Set dst (ReplicateB con));
3527   format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
3528   ins_encode %{
3529     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3530   %}
3531   ins_pipe( pipe_slow );
3532 %}
3533 
3534 // Replicate byte scalar zero to be vector
3535 instruct Repl4B_zero(vecS dst, immI0 zero) %{
3536   predicate(n->as_Vector()->length() == 4);
3537   match(Set dst (ReplicateB zero));
3538   format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
3539   ins_encode %{
3540     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3541   %}
3542   ins_pipe( fpu_reg_reg );
3543 %}
3544 
3545 instruct Repl8B_zero(vecD dst, immI0 zero) %{
3546   predicate(n->as_Vector()->length() == 8);
3547   match(Set dst (ReplicateB zero));
3548   format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
3549   ins_encode %{
3550     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3551   %}
3552   ins_pipe( fpu_reg_reg );
3553 %}
3554 
3555 instruct Repl16B_zero(vecX dst, immI0 zero) %{
3556   predicate(n->as_Vector()->length() == 16);
3557   match(Set dst (ReplicateB zero));
3558   format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
3559   ins_encode %{
3560     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3561   %}
3562   ins_pipe( fpu_reg_reg );
3563 %}
3564 
3565 instruct Repl32B_zero(vecY dst, immI0 zero) %{
3566   predicate(n->as_Vector()->length() == 32);
3567   match(Set dst (ReplicateB zero));
3568   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
3569   ins_encode %{
3570     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
3571     int vector_len = 1;
3572     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3573   %}
3574   ins_pipe( fpu_reg_reg );
3575 %}
3576 
3577 // Replicate char/short (2 byte) scalar to be vector
3578 instruct Repl2S(vecS dst, rRegI src) %{
3579   predicate(n->as_Vector()->length() == 2);
3580   match(Set dst (ReplicateS src));
3581   format %{ "movd    $dst,$src\n\t"
3582             "pshuflw $dst,$dst,0x00\t! replicate2S" %}
3583   ins_encode %{
3584     __ movdl($dst$$XMMRegister, $src$$Register);
3585     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3586   %}
3587   ins_pipe( fpu_reg_reg );
3588 %}
3589 
3590 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
3591 instruct Repl2S_imm(vecS dst, immI con) %{
3592   predicate(n->as_Vector()->length() == 2);
3593   match(Set dst (ReplicateS con));
3594   format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
3595   ins_encode %{
3596     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
3597   %}
3598   ins_pipe( fpu_reg_reg );
3599 %}
3600 
3601 instruct Repl4S_imm(vecD dst, immI con) %{
3602   predicate(n->as_Vector()->length() == 4);
3603   match(Set dst (ReplicateS con));
3604   format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
3605   ins_encode %{
3606     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3607   %}
3608   ins_pipe( fpu_reg_reg );
3609 %}
3610 
3611 // Replicate char/short (2 byte) scalar zero to be vector
3612 instruct Repl2S_zero(vecS dst, immI0 zero) %{
3613   predicate(n->as_Vector()->length() == 2);
3614   match(Set dst (ReplicateS zero));
3615   format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
3616   ins_encode %{
3617     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3618   %}
3619   ins_pipe( fpu_reg_reg );
3620 %}
3621 
3622 instruct Repl4S_zero(vecD dst, immI0 zero) %{
3623   predicate(n->as_Vector()->length() == 4);
3624   match(Set dst (ReplicateS zero));
3625   format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
3626   ins_encode %{
3627     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3628   %}
3629   ins_pipe( fpu_reg_reg );
3630 %}
3631 
3632 instruct Repl8S_zero(vecX dst, immI0 zero) %{
3633   predicate(n->as_Vector()->length() == 8);
3634   match(Set dst (ReplicateS zero));
3635   format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
3636   ins_encode %{
3637     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3638   %}
3639   ins_pipe( fpu_reg_reg );
3640 %}
3641 
3642 instruct Repl16S_zero(vecY dst, immI0 zero) %{
3643   predicate(n->as_Vector()->length() == 16);
3644   match(Set dst (ReplicateS zero));
3645   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
3646   ins_encode %{
3647     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
3648     int vector_len = 1;
3649     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3650   %}
3651   ins_pipe( fpu_reg_reg );
3652 %}
3653 
3654 // Replicate integer (4 byte) scalar to be vector
3655 instruct Repl2I(vecD dst, rRegI src) %{
3656   predicate(n->as_Vector()->length() == 2);
3657   match(Set dst (ReplicateI src));
3658   format %{ "movd    $dst,$src\n\t"
3659             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
3660   ins_encode %{
3661     __ movdl($dst$$XMMRegister, $src$$Register);
3662     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3663   %}
3664   ins_pipe( fpu_reg_reg );
3665 %}
3666 
3667 // Integer could be loaded into xmm register directly from memory.
3668 instruct Repl2I_mem(vecD dst, memory mem) %{
3669   predicate(n->as_Vector()->length() == 2);
3670   match(Set dst (ReplicateI (LoadI mem)));
3671   format %{ "movd    $dst,$mem\n\t"
3672             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
3673   ins_encode %{
3674     __ movdl($dst$$XMMRegister, $mem$$Address);
3675     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3676   %}
3677   ins_pipe( fpu_reg_reg );
3678 %}
3679 
3680 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
3681 instruct Repl2I_imm(vecD dst, immI con) %{
3682   predicate(n->as_Vector()->length() == 2);
3683   match(Set dst (ReplicateI con));
3684   format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
3685   ins_encode %{
3686     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3687   %}
3688   ins_pipe( fpu_reg_reg );
3689 %}
3690 
3691 // Replicate integer (4 byte) scalar zero to be vector
3692 instruct Repl2I_zero(vecD dst, immI0 zero) %{
3693   predicate(n->as_Vector()->length() == 2);
3694   match(Set dst (ReplicateI zero));
3695   format %{ "pxor    $dst,$dst\t! replicate2I" %}
3696   ins_encode %{
3697     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3698   %}
3699   ins_pipe( fpu_reg_reg );
3700 %}
3701 
3702 instruct Repl4I_zero(vecX dst, immI0 zero) %{
3703   predicate(n->as_Vector()->length() == 4);
3704   match(Set dst (ReplicateI zero));
3705   format %{ "pxor    $dst,$dst\t! replicate4I zero)" %}
3706   ins_encode %{
3707     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3708   %}
3709   ins_pipe( fpu_reg_reg );
3710 %}
3711 
3712 instruct Repl8I_zero(vecY dst, immI0 zero) %{
3713   predicate(n->as_Vector()->length() == 8);
3714   match(Set dst (ReplicateI zero));
3715   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
3716   ins_encode %{
3717     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
3718     int vector_len = 1;
3719     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3720   %}
3721   ins_pipe( fpu_reg_reg );
3722 %}
3723 
3724 // Replicate long (8 byte) scalar to be vector
3725 #ifdef _LP64
3726 instruct Repl2L(vecX dst, rRegL src) %{
3727   predicate(n->as_Vector()->length() == 2);
3728   match(Set dst (ReplicateL src));
3729   format %{ "movdq   $dst,$src\n\t"
3730             "punpcklqdq $dst,$dst\t! replicate2L" %}
3731   ins_encode %{
3732     __ movdq($dst$$XMMRegister, $src$$Register);
3733     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3734   %}
3735   ins_pipe( pipe_slow );
3736 %}
3737 #else // _LP64
3738 instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
3739   predicate(n->as_Vector()->length() == 2);
3740   match(Set dst (ReplicateL src));
3741   effect(TEMP dst, USE src, TEMP tmp);
3742   format %{ "movdl   $dst,$src.lo\n\t"
3743             "movdl   $tmp,$src.hi\n\t"
3744             "punpckldq $dst,$tmp\n\t"
3745             "punpcklqdq $dst,$dst\t! replicate2L"%}
3746   ins_encode %{
3747     __ movdl($dst$$XMMRegister, $src$$Register);
3748     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3749     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3750     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3751   %}
3752   ins_pipe( pipe_slow );
3753 %}
3754 #endif // _LP64
3755 
3756 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
3757 instruct Repl2L_imm(vecX dst, immL con) %{
3758   predicate(n->as_Vector()->length() == 2);
3759   match(Set dst (ReplicateL con));
3760   format %{ "movq    $dst,[$constantaddress]\n\t"
3761             "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
3762   ins_encode %{
3763     __ movq($dst$$XMMRegister, $constantaddress($con));
3764     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3765   %}
3766   ins_pipe( pipe_slow );
3767 %}
3768 
3769 // Replicate long (8 byte) scalar zero to be vector
3770 instruct Repl2L_zero(vecX dst, immL0 zero) %{
3771   predicate(n->as_Vector()->length() == 2);
3772   match(Set dst (ReplicateL zero));
3773   format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
3774   ins_encode %{
3775     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3776   %}
3777   ins_pipe( fpu_reg_reg );
3778 %}
3779 
3780 instruct Repl4L_zero(vecY dst, immL0 zero) %{
3781   predicate(n->as_Vector()->length() == 4);
3782   match(Set dst (ReplicateL zero));
3783   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
3784   ins_encode %{
3785     // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
3786     int vector_len = 1;
3787     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3788   %}
3789   ins_pipe( fpu_reg_reg );
3790 %}
3791 
3792 // Replicate float (4 byte) scalar to be vector
3793 instruct Repl2F(vecD dst, regF src) %{
3794   predicate(n->as_Vector()->length() == 2);
3795   match(Set dst (ReplicateF src));
3796   format %{ "pshufd  $dst,$dst,0x00\t! replicate2F" %}
3797   ins_encode %{
3798     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3799   %}
3800   ins_pipe( fpu_reg_reg );
3801 %}
3802 
3803 instruct Repl4F(vecX dst, regF src) %{
3804   predicate(n->as_Vector()->length() == 4);
3805   match(Set dst (ReplicateF src));
3806   format %{ "pshufd  $dst,$dst,0x00\t! replicate4F" %}
3807   ins_encode %{
3808     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3809   %}
3810   ins_pipe( pipe_slow );
3811 %}
3812 
3813 // Replicate double (8 bytes) scalar to be vector
3814 instruct Repl2D(vecX dst, regD src) %{
3815   predicate(n->as_Vector()->length() == 2);
3816   match(Set dst (ReplicateD src));
3817   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
3818   ins_encode %{
3819     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3820   %}
3821   ins_pipe( pipe_slow );
3822 %}
3823 
3824 // ====================EVEX REPLICATE=============================================
3825 
3826 instruct Repl4B_mem_evex(vecS dst, memory mem) %{
3827   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
3828   match(Set dst (ReplicateB (LoadB mem)));
3829   format %{ "vpbroadcastb  $dst,$mem\t! replicate4B" %}
3830   ins_encode %{
3831     int vector_len = 0;
3832     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3833   %}
3834   ins_pipe( pipe_slow );
3835 %}
3836 
3837 instruct Repl8B_mem_evex(vecD dst, memory mem) %{
3838   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
3839   match(Set dst (ReplicateB (LoadB mem)));
3840   format %{ "vpbroadcastb  $dst,$mem\t! replicate8B" %}
3841   ins_encode %{
3842     int vector_len = 0;
3843     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3844   %}
3845   ins_pipe( pipe_slow );
3846 %}
3847 
3848 instruct Repl16B_evex(vecX dst, rRegI src) %{
3849   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3850   match(Set dst (ReplicateB src));
3851   format %{ "vpbroadcastb $dst,$src\t! replicate16B" %}
3852   ins_encode %{
3853    int vector_len = 0;
3854     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
3855   %}
3856   ins_pipe( pipe_slow );
3857 %}
3858 
3859 instruct Repl16B_mem_evex(vecX dst, memory mem) %{
3860   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3861   match(Set dst (ReplicateB (LoadB mem)));
3862   format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
3863   ins_encode %{
3864     int vector_len = 0;
3865     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3866   %}
3867   ins_pipe( pipe_slow );
3868 %}
3869 
3870 instruct Repl32B_evex(vecY dst, rRegI src) %{
3871   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
3872   match(Set dst (ReplicateB src));
3873   format %{ "vpbroadcastb $dst,$src\t! replicate32B" %}
3874   ins_encode %{
3875    int vector_len = 1;
3876     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
3877   %}
3878   ins_pipe( pipe_slow );
3879 %}
3880 
3881 instruct Repl32B_mem_evex(vecY dst, memory mem) %{
3882   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
3883   match(Set dst (ReplicateB (LoadB mem)));
3884   format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
3885   ins_encode %{
3886     int vector_len = 1;
3887     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3888   %}
3889   ins_pipe( pipe_slow );
3890 %}
3891 
3892 instruct Repl64B_evex(vecZ dst, rRegI src) %{
3893   predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
3894   match(Set dst (ReplicateB src));
3895   format %{ "vpbroadcastb $dst,$src\t! upper replicate64B" %}
3896   ins_encode %{
3897    int vector_len = 2;
3898     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
3899   %}
3900   ins_pipe( pipe_slow );
3901 %}
3902 
3903 instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
3904   predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
3905   match(Set dst (ReplicateB (LoadB mem)));
3906   format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
3907   ins_encode %{
3908     int vector_len = 2;
3909     __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
3910   %}
3911   ins_pipe( pipe_slow );
3912 %}
3913 
3914 instruct Repl16B_imm_evex(vecX dst, immI con) %{
3915   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
3916   match(Set dst (ReplicateB con));
3917   format %{ "movq    $dst,[$constantaddress]\n\t"
3918             "vpbroadcastb $dst,$dst\t! replicate16B" %}
3919   ins_encode %{
3920    int vector_len = 0;
3921     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3922     __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3923   %}
3924   ins_pipe( pipe_slow );
3925 %}
3926 
3927 instruct Repl32B_imm_evex(vecY dst, immI con) %{
3928   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
3929   match(Set dst (ReplicateB con));
3930   format %{ "movq    $dst,[$constantaddress]\n\t"
3931             "vpbroadcastb $dst,$dst\t! replicate32B" %}
3932   ins_encode %{
3933    int vector_len = 1;
3934     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3935     __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3936   %}
3937   ins_pipe( pipe_slow );
3938 %}
3939 
3940 instruct Repl64B_imm_evex(vecZ dst, immI con) %{
3941   predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512bw());
3942   match(Set dst (ReplicateB con));
3943   format %{ "movq    $dst,[$constantaddress]\n\t"
3944             "vpbroadcastb $dst,$dst\t! upper replicate64B" %}
3945   ins_encode %{
3946    int vector_len = 2;
3947     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3948     __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3949   %}
3950   ins_pipe( pipe_slow );
3951 %}
3952 
3953 instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{
3954   predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
3955   match(Set dst (ReplicateB zero));
3956   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
3957   ins_encode %{
3958     // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
3959     int vector_len = 2;
3960     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3961   %}
3962   ins_pipe( fpu_reg_reg );
3963 %}
3964 
3965 instruct Repl4S_evex(vecD dst, rRegI src) %{
3966   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
3967   match(Set dst (ReplicateS src));
3968   format %{ "vpbroadcastw $dst,$src\t! replicate4S" %}
3969   ins_encode %{
3970    int vector_len = 0;
3971     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
3972   %}
3973   ins_pipe( pipe_slow );
3974 %}
3975 
3976 instruct Repl4S_mem_evex(vecD dst, memory mem) %{
3977   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
3978   match(Set dst (ReplicateS (LoadS mem)));
3979   format %{ "vpbroadcastw  $dst,$mem\t! replicate4S" %}
3980   ins_encode %{
3981     int vector_len = 0;
3982     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
3983   %}
3984   ins_pipe( pipe_slow );
3985 %}
3986 
3987 instruct Repl8S_evex(vecX dst, rRegI src) %{
3988   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
3989   match(Set dst (ReplicateS src));
3990   format %{ "vpbroadcastw $dst,$src\t! replicate8S" %}
3991   ins_encode %{
3992    int vector_len = 0;
3993     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
3994   %}
3995   ins_pipe( pipe_slow );
3996 %}
3997 
3998 instruct Repl8S_mem_evex(vecX dst, memory mem) %{
3999   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
4000   match(Set dst (ReplicateS (LoadS mem)));
4001   format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
4002   ins_encode %{
4003     int vector_len = 0;
4004     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4005   %}
4006   ins_pipe( pipe_slow );
4007 %}
4008 
4009 instruct Repl16S_evex(vecY dst, rRegI src) %{
4010   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
4011   match(Set dst (ReplicateS src));
4012   format %{ "vpbroadcastw $dst,$src\t! replicate16S" %}
4013   ins_encode %{
4014    int vector_len = 1;
4015     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4016   %}
4017   ins_pipe( pipe_slow );
4018 %}
4019 
4020 instruct Repl16S_mem_evex(vecY dst, memory mem) %{
4021   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
4022   match(Set dst (ReplicateS (LoadS mem)));
4023   format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
4024   ins_encode %{
4025     int vector_len = 1;
4026     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4027   %}
4028   ins_pipe( pipe_slow );
4029 %}
4030 
4031 instruct Repl32S_evex(vecZ dst, rRegI src) %{
4032   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
4033   match(Set dst (ReplicateS src));
4034   format %{ "vpbroadcastw $dst,$src\t! replicate32S" %}
4035   ins_encode %{
4036    int vector_len = 2;
4037     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4038   %}
4039   ins_pipe( pipe_slow );
4040 %}
4041 
4042 instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
4043   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
4044   match(Set dst (ReplicateS (LoadS mem)));
4045   format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
4046   ins_encode %{
4047     int vector_len = 2;
4048     __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4049   %}
4050   ins_pipe( pipe_slow );
4051 %}
4052 
4053 instruct Repl8S_imm_evex(vecX dst, immI con) %{
4054   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
4055   match(Set dst (ReplicateS con));
4056   format %{ "movq    $dst,[$constantaddress]\n\t"
4057             "vpbroadcastw $dst,$dst\t! replicate8S" %}
4058   ins_encode %{
4059    int vector_len = 0;
4060     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4061     __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4062   %}
4063   ins_pipe( pipe_slow );
4064 %}
4065 
4066 instruct Repl16S_imm_evex(vecY dst, immI con) %{
4067   predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
4068   match(Set dst (ReplicateS con));
4069   format %{ "movq    $dst,[$constantaddress]\n\t"
4070             "vpbroadcastw $dst,$dst\t! replicate16S" %}
4071   ins_encode %{
4072    int vector_len = 1;
4073     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4074     __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4075   %}
4076   ins_pipe( pipe_slow );
4077 %}
4078 
4079 instruct Repl32S_imm_evex(vecZ dst, immI con) %{
4080   predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512bw());
4081   match(Set dst (ReplicateS con));
4082   format %{ "movq    $dst,[$constantaddress]\n\t"
4083             "vpbroadcastw $dst,$dst\t! replicate32S" %}
4084   ins_encode %{
4085    int vector_len = 2;
4086     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4087     __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4088   %}
4089   ins_pipe( pipe_slow );
4090 %}
4091 
4092 instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{
4093   predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
4094   match(Set dst (ReplicateS zero));
4095   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
4096   ins_encode %{
4097     // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
4098     int vector_len = 2;
4099     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4100   %}
4101   ins_pipe( fpu_reg_reg );
4102 %}
4103 
4104 instruct Repl4I_evex(vecX dst, rRegI src) %{
4105   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4106   match(Set dst (ReplicateI src));
4107   format %{ "vpbroadcastd  $dst,$src\t! replicate4I" %}
4108   ins_encode %{
4109     int vector_len = 0;
4110     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4111   %}
4112   ins_pipe( pipe_slow );
4113 %}
4114 
4115 instruct Repl4I_mem_evex(vecX dst, memory mem) %{
4116   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4117   match(Set dst (ReplicateI (LoadI mem)));
4118   format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
4119   ins_encode %{
4120     int vector_len = 0;
4121     __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4122   %}
4123   ins_pipe( pipe_slow );
4124 %}
4125 
4126 instruct Repl8I_evex(vecY dst, rRegI src) %{
4127   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4128   match(Set dst (ReplicateI src));
4129   format %{ "vpbroadcastd  $dst,$src\t! replicate8I" %}
4130   ins_encode %{
4131     int vector_len = 1;
4132     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4133   %}
4134   ins_pipe( pipe_slow );
4135 %}
4136 
4137 instruct Repl8I_mem_evex(vecY dst, memory mem) %{
4138   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4139   match(Set dst (ReplicateI (LoadI mem)));
4140   format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
4141   ins_encode %{
4142     int vector_len = 1;
4143     __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4144   %}
4145   ins_pipe( pipe_slow );
4146 %}
4147 
4148 instruct Repl16I_evex(vecZ dst, rRegI src) %{
4149   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4150   match(Set dst (ReplicateI src));
4151   format %{ "vpbroadcastd  $dst,$src\t! replicate16I" %}
4152   ins_encode %{
4153     int vector_len = 2;
4154     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4155   %}
4156   ins_pipe( pipe_slow );
4157 %}
4158 
4159 instruct Repl16I_mem_evex(vecZ dst, memory mem) %{
4160   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4161   match(Set dst (ReplicateI (LoadI mem)));
4162   format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
4163   ins_encode %{
4164     int vector_len = 2;
4165     __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4166   %}
4167   ins_pipe( pipe_slow );
4168 %}
4169 
4170 instruct Repl4I_imm_evex(vecX dst, immI con) %{
4171   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4172   match(Set dst (ReplicateI con));
4173   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4174             "vpbroadcastd  $dst,$dst\t! replicate4I" %}
4175   ins_encode %{
4176     int vector_len = 0;
4177     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4178     __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4179   %}
4180   ins_pipe( pipe_slow );
4181 %}
4182 
4183 instruct Repl8I_imm_evex(vecY dst, immI con) %{
4184   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4185   match(Set dst (ReplicateI con));
4186   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4187             "vpbroadcastd  $dst,$dst\t! replicate8I" %}
4188   ins_encode %{
4189     int vector_len = 1;
4190     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4191     __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4192   %}
4193   ins_pipe( pipe_slow );
4194 %}
4195 
4196 instruct Repl16I_imm_evex(vecZ dst, immI con) %{
4197   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4198   match(Set dst (ReplicateI con));
4199   format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
4200             "vpbroadcastd  $dst,$dst\t! replicate16I" %}
4201   ins_encode %{
4202     int vector_len = 2;
4203     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4204     __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4205   %}
4206   ins_pipe( pipe_slow );
4207 %}
4208 
4209 instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{
4210   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4211   match(Set dst (ReplicateI zero));
4212   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
4213   ins_encode %{
4214     // Use vxorpd since AVX does not have vpxor for 512-bit (AVX2 will have it).
4215     int vector_len = 2;
4216     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4217   %}
4218   ins_pipe( fpu_reg_reg );
4219 %}
4220 
4221 // Replicate long (8 byte) scalar to be vector
4222 #ifdef _LP64
4223 instruct Repl4L_evex(vecY dst, rRegL src) %{
4224   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4225   match(Set dst (ReplicateL src));
4226   format %{ "vpbroadcastq  $dst,$src\t! replicate4L" %}
4227   ins_encode %{
4228     int vector_len = 1;
4229     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4230   %}
4231   ins_pipe( pipe_slow );
4232 %}
4233 
4234 instruct Repl8L_evex(vecZ dst, rRegL src) %{
4235   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4236   match(Set dst (ReplicateL src));
4237   format %{ "vpbroadcastq  $dst,$src\t! replicate8L" %}
4238   ins_encode %{
4239     int vector_len = 2;
4240     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4241   %}
4242   ins_pipe( pipe_slow );
4243 %}
4244 #else // _LP64
4245 instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
4246   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4247   match(Set dst (ReplicateL src));
4248   effect(TEMP dst, USE src, TEMP tmp);
4249   format %{ "movdl   $dst,$src.lo\n\t"
4250             "movdl   $tmp,$src.hi\n\t"
4251             "punpckldq $dst,$tmp\n\t"
4252             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4253   ins_encode %{
4254     int vector_len = 1;
4255     __ movdl($dst$$XMMRegister, $src$$Register);
4256     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4257     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4258     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4259   %}
4260   ins_pipe( pipe_slow );
4261 %}
4262 
4263 instruct Repl8L_evex(vecZ dst, eRegL src, regD tmp) %{
4264   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4265   match(Set dst (ReplicateL src));
4266   effect(TEMP dst, USE src, TEMP tmp);
4267   format %{ "movdl   $dst,$src.lo\n\t"
4268             "movdl   $tmp,$src.hi\n\t"
4269             "punpckldq $dst,$tmp\n\t"
4270             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4271   ins_encode %{
4272     int vector_len = 2;
4273     __ movdl($dst$$XMMRegister, $src$$Register);
4274     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4275     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4276     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4277   %}
4278   ins_pipe( pipe_slow );
4279 %}
4280 #endif // _LP64
4281 
4282 instruct Repl4L_imm_evex(vecY dst, immL con) %{
4283   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4284   match(Set dst (ReplicateL con));
4285   format %{ "movq    $dst,[$constantaddress]\n\t"
4286             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4287   ins_encode %{
4288     int vector_len = 1;
4289     __ movq($dst$$XMMRegister, $constantaddress($con));
4290     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4291   %}
4292   ins_pipe( pipe_slow );
4293 %}
4294 
4295 instruct Repl8L_imm_evex(vecZ dst, immL con) %{
4296   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4297   match(Set dst (ReplicateL con));
4298   format %{ "movq    $dst,[$constantaddress]\n\t"
4299             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4300   ins_encode %{
4301     int vector_len = 2;
4302     __ movq($dst$$XMMRegister, $constantaddress($con));
4303     __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4304   %}
4305   ins_pipe( pipe_slow );
4306 %}
4307 
4308 instruct Repl2L_mem_evex(vecX dst, memory mem) %{
4309   predicate(n->as_Vector()->length() == 2 && VM_Version::supports_avx512vl());
4310   match(Set dst (ReplicateL (LoadL mem)));
4311   format %{ "vpbroadcastd  $dst,$mem\t! replicate2L" %}
4312   ins_encode %{
4313     int vector_len = 0;
4314     __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4315   %}
4316   ins_pipe( pipe_slow );
4317 %}
4318 
4319 instruct Repl4L_mem_evex(vecY dst, memory mem) %{
4320   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4321   match(Set dst (ReplicateL (LoadL mem)));
4322   format %{ "vpbroadcastd  $dst,$mem\t! replicate4L" %}
4323   ins_encode %{
4324     int vector_len = 1;
4325     __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4326   %}
4327   ins_pipe( pipe_slow );
4328 %}
4329 
4330 instruct Repl8L_mem_evex(vecZ dst, memory mem) %{
4331   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4332   match(Set dst (ReplicateL (LoadL mem)));
4333   format %{ "vpbroadcastd  $dst,$mem\t! replicate8L" %}
4334   ins_encode %{
4335     int vector_len = 2;
4336     __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4337   %}
4338   ins_pipe( pipe_slow );
4339 %}
4340 
4341 instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{
4342   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4343   match(Set dst (ReplicateL zero));
4344   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
4345   ins_encode %{
4346     // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
4347     int vector_len = 2;
4348     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4349   %}
4350   ins_pipe( fpu_reg_reg );
4351 %}
4352 
4353 instruct Repl8F_evex(vecY dst, regF src) %{
4354   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4355   match(Set dst (ReplicateF src));
4356   format %{ "vbroadcastss $dst,$src\t! replicate8F" %}
4357   ins_encode %{
4358     int vector_len = 1;
4359     __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4360   %}
4361   ins_pipe( pipe_slow );
4362 %}
4363 
4364 instruct Repl8F_mem_evex(vecY dst, memory mem) %{
4365   predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
4366   match(Set dst (ReplicateF (LoadF mem)));
4367   format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
4368   ins_encode %{
4369     int vector_len = 1;
4370     __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4371   %}
4372   ins_pipe( pipe_slow );
4373 %}
4374 
4375 instruct Repl16F_evex(vecZ dst, regF src) %{
4376   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4377   match(Set dst (ReplicateF src));
4378   format %{ "vbroadcastss $dst,$src\t! replicate16F" %}
4379   ins_encode %{
4380     int vector_len = 2;
4381     __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4382   %}
4383   ins_pipe( pipe_slow );
4384 %}
4385 
4386 instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
4387   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4388   match(Set dst (ReplicateF (LoadF mem)));
4389   format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
4390   ins_encode %{
4391     int vector_len = 2;
4392     __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4393   %}
4394   ins_pipe( pipe_slow );
4395 %}
4396 
4397 instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
4398   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
4399   match(Set dst (ReplicateF zero));
4400   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2F zero" %}
4401   ins_encode %{
4402     // Use vpxor in place of vxorps since EVEX has a constriant on dq for vxorps: this is a 512-bit operation
4403     int vector_len = 2;
4404     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4405   %}
4406   ins_pipe( fpu_reg_reg );
4407 %}
4408 
4409 instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
4410   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
4411   match(Set dst (ReplicateF zero));
4412   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4F zero" %}
4413   ins_encode %{
4414     // Use vpxor in place of vxorps since EVEX has a constriant on dq for vxorps: this is a 512-bit operation
4415     int vector_len = 2;
4416     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4417   %}
4418   ins_pipe( fpu_reg_reg );
4419 %}
4420 
4421 instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
4422   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4423   match(Set dst (ReplicateF zero));
4424   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8F zero" %}
4425   ins_encode %{
4426     // Use vpxor in place of vxorps since EVEX has a constriant on dq for vxorps: this is a 512-bit operation
4427     int vector_len = 2;
4428     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4429   %}
4430   ins_pipe( fpu_reg_reg );
4431 %}
4432 
4433 instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
4434   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4435   match(Set dst (ReplicateF zero));
4436   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate16F zero" %}
4437   ins_encode %{
4438     // Use vpxor in place of vxorps since EVEX has a constriant on dq for vxorps: this is a 512-bit operation
4439     int vector_len = 2;
4440     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4441   %}
4442   ins_pipe( fpu_reg_reg );
4443 %}
4444 
4445 instruct Repl4D_evex(vecY dst, regD src) %{
4446   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4447   match(Set dst (ReplicateD src));
4448   format %{ "vbroadcastsd $dst,$src\t! replicate4D" %}
4449   ins_encode %{
4450     int vector_len = 1;
4451     __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4452   %}
4453   ins_pipe( pipe_slow );
4454 %}
4455 
4456 instruct Repl4D_mem_evex(vecY dst, memory mem) %{
4457   predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
4458   match(Set dst (ReplicateD (LoadD mem)));
4459   format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
4460   ins_encode %{
4461     int vector_len = 1;
4462     __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4463   %}
4464   ins_pipe( pipe_slow );
4465 %}
4466 
4467 instruct Repl8D_evex(vecZ dst, regD src) %{
4468   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4469   match(Set dst (ReplicateD src));
4470   format %{ "vbroadcastsd $dst,$src\t! replicate8D" %}
4471   ins_encode %{
4472     int vector_len = 2;
4473     __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4474   %}
4475   ins_pipe( pipe_slow );
4476 %}
4477 
4478 instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
4479   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4480   match(Set dst (ReplicateD (LoadD mem)));
4481   format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
4482   ins_encode %{
4483     int vector_len = 2;
4484     __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4485   %}
4486   ins_pipe( pipe_slow );
4487 %}
4488 
4489 instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
4490   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
4491   match(Set dst (ReplicateD zero));
4492   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2D zero" %}
4493   ins_encode %{
4494     // Use vpxor in place of vxorpd since EVEX has a constriant on dq for vxorpd: this is a 512-bit operation
4495     int vector_len = 2;
4496     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4497   %}
4498   ins_pipe( fpu_reg_reg );
4499 %}
4500 
4501 instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
4502   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
4503   match(Set dst (ReplicateD zero));
4504   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4D zero" %}
4505   ins_encode %{
4506     // Use vpxor in place of vxorpd since EVEX has a constriant on dq for vxorpd: this is a 512-bit operation
4507     int vector_len = 2;
4508     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4509   %}
4510   ins_pipe( fpu_reg_reg );
4511 %}
4512 
4513 instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
4514   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4515   match(Set dst (ReplicateD zero));
4516   format %{ "vpxor  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
4517   ins_encode %{
4518     // Use vpxor in place of vxorpd since EVEX has a constriant on dq for vxorpd: this is a 512-bit operation
4519     int vector_len = 2;
4520     __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4521   %}
4522   ins_pipe( fpu_reg_reg );
4523 %}
4524 
4525 // ====================REDUCTION ARITHMETIC=======================================
4526 
4527 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4528   predicate(UseSSE > 2 && UseAVX == 0);
4529   match(Set dst (AddReductionVI src1 src2));
4530   effect(TEMP tmp2, TEMP tmp);
4531   format %{ "movdqu  $tmp2,$src2\n\t"
4532             "phaddd  $tmp2,$tmp2\n\t"
4533             "movd    $tmp,$src1\n\t"
4534             "paddd   $tmp,$tmp2\n\t"
4535             "movd    $dst,$tmp\t! add reduction2I" %}
4536   ins_encode %{
4537     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
4538     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4539     __ movdl($tmp$$XMMRegister, $src1$$Register);
4540     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
4541     __ movdl($dst$$Register, $tmp$$XMMRegister);
4542   %}
4543   ins_pipe( pipe_slow );
4544 %}
4545 
4546 instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4547   predicate(VM_Version::supports_avxonly());
4548   match(Set dst (AddReductionVI src1 src2));
4549   effect(TEMP tmp, TEMP tmp2);
4550   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4551             "movd     $tmp2,$src1\n\t"
4552             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4553             "movd     $dst,$tmp2\t! add reduction2I" %}
4554   ins_encode %{
4555     int vector_len = 0;
4556     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4557     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4558     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4559     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4560   %}
4561   ins_pipe( pipe_slow );
4562 %}
4563 
4564 instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
4565   predicate(UseAVX > 2);
4566   match(Set dst (AddReductionVI src1 src2));
4567   effect(TEMP tmp, TEMP tmp2);
4568   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
4569             "vpaddd  $tmp,$src2,$tmp2\n\t"
4570             "movd    $tmp2,$src1\n\t"
4571             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4572             "movd    $dst,$tmp2\t! add reduction2I" %}
4573   ins_encode %{
4574     int vector_len = 0;
4575     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4576     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4577     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4578     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4579     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4580   %}
4581   ins_pipe( pipe_slow );
4582 %}
4583 
4584 instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4585   predicate(UseSSE > 2 && UseAVX == 0);
4586   match(Set dst (AddReductionVI src1 src2));
4587   effect(TEMP tmp, TEMP tmp2);
4588   format %{ "movdqu  $tmp,$src2\n\t"
4589             "phaddd  $tmp,$tmp\n\t"
4590             "phaddd  $tmp,$tmp\n\t"
4591             "movd    $tmp2,$src1\n\t"
4592             "paddd   $tmp2,$tmp\n\t"
4593             "movd    $dst,$tmp2\t! add reduction4I" %}
4594   ins_encode %{
4595     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
4596     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
4597     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
4598     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4599     __ paddd($tmp2$$XMMRegister, $tmp$$XMMRegister);
4600     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4601   %}
4602   ins_pipe( pipe_slow );
4603 %}
4604 
4605 instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4606   predicate(VM_Version::supports_avxonly());
4607   match(Set dst (AddReductionVI src1 src2));
4608   effect(TEMP tmp, TEMP tmp2);
4609   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4610             "vphaddd  $tmp,$tmp,$tmp\n\t"
4611             "movd     $tmp2,$src1\n\t"
4612             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4613             "movd     $dst,$tmp2\t! add reduction4I" %}
4614   ins_encode %{
4615     int vector_len = 0;
4616     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4617     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
4618     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4619     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4620     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4621   %}
4622   ins_pipe( pipe_slow );
4623 %}
4624 
4625 instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
4626   predicate(UseAVX > 2);
4627   match(Set dst (AddReductionVI src1 src2));
4628   effect(TEMP tmp, TEMP tmp2);
4629   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4630             "vpaddd  $tmp,$src2,$tmp2\n\t"
4631             "pshufd  $tmp2,$tmp,0x1\n\t"
4632             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4633             "movd    $tmp2,$src1\n\t"
4634             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4635             "movd    $dst,$tmp2\t! add reduction4I" %}
4636   ins_encode %{
4637     int vector_len = 0;
4638     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4639     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4640     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4641     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4642     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4643     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4644     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4645   %}
4646   ins_pipe( pipe_slow );
4647 %}
4648 
4649 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4650   predicate(VM_Version::supports_avxonly());
4651   match(Set dst (AddReductionVI src1 src2));
4652   effect(TEMP tmp, TEMP tmp2);
4653   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4654             "vphaddd  $tmp,$tmp,$tmp2\n\t"
4655             "vextracti128_high  $tmp2,$tmp\n\t"
4656             "vpaddd   $tmp,$tmp,$tmp2\n\t"
4657             "movd     $tmp2,$src1\n\t"
4658             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4659             "movd     $dst,$tmp2\t! add reduction8I" %}
4660   ins_encode %{
4661     int vector_len = 1;
4662     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4663     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4664     __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
4665     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4666     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4667     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4668     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4669   %}
4670   ins_pipe( pipe_slow );
4671 %}
4672 
4673 instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
4674   predicate(UseAVX > 2);
4675   match(Set dst (AddReductionVI src1 src2));
4676   effect(TEMP tmp, TEMP tmp2);
4677   format %{ "vextracti128_high  $tmp,$src2\n\t"
4678             "vpaddd  $tmp,$tmp,$src2\n\t"
4679             "pshufd  $tmp2,$tmp,0xE\n\t"
4680             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4681             "pshufd  $tmp2,$tmp,0x1\n\t"
4682             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4683             "movd    $tmp2,$src1\n\t"
4684             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4685             "movd    $dst,$tmp2\t! add reduction8I" %}
4686   ins_encode %{
4687     int vector_len = 0;
4688     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
4689     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
4690     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4691     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4692     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4693     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4694     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4695     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4696     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4697   %}
4698   ins_pipe( pipe_slow );
4699 %}
4700 
4701 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
4702   predicate(UseAVX > 2);
4703   match(Set dst (AddReductionVI src1 src2));
4704   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
4705   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
4706             "vpaddd  $tmp3,$tmp3,$src2\n\t"
4707             "vextracti128_high  $tmp,$tmp3\n\t"
4708             "vpaddd  $tmp,$tmp,$tmp3\n\t"
4709             "pshufd  $tmp2,$tmp,0xE\n\t"
4710             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4711             "pshufd  $tmp2,$tmp,0x1\n\t"
4712             "vpaddd  $tmp,$tmp,$tmp2\n\t"
4713             "movd    $tmp2,$src1\n\t"
4714             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4715             "movd    $dst,$tmp2\t! mul reduction16I" %}
4716   ins_encode %{
4717     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
4718     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
4719     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
4720     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
4721     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
4722     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4723     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
4724     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4725     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4726     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4727     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4728   %}
4729   ins_pipe( pipe_slow );
4730 %}
4731 
4732 #ifdef _LP64
4733 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
4734   predicate(UseAVX > 2);
4735   match(Set dst (AddReductionVL src1 src2));
4736   effect(TEMP tmp, TEMP tmp2);
4737   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
4738             "vpaddq  $tmp,$src2,$tmp2\n\t"
4739             "movdq   $tmp2,$src1\n\t"
4740             "vpaddq  $tmp2,$tmp,$tmp2\n\t"
4741             "movdq   $dst,$tmp2\t! add reduction2L" %}
4742   ins_encode %{
4743     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
4744     __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
4745     __ movdq($tmp2$$XMMRegister, $src1$$Register);
4746     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
4747     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4748   %}
4749   ins_pipe( pipe_slow );
4750 %}
4751 
4752 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
4753   predicate(UseAVX > 2);
4754   match(Set dst (AddReductionVL src1 src2));
4755   effect(TEMP tmp, TEMP tmp2);
4756   format %{ "vextracti128_high  $tmp,$src2\n\t"
4757             "vpaddq  $tmp2,$tmp,$src2\n\t"
4758             "pshufd  $tmp,$tmp2,0xE\n\t"
4759             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4760             "movdq   $tmp,$src1\n\t"
4761             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4762             "movdq   $dst,$tmp2\t! add reduction4L" %}
4763   ins_encode %{
4764     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
4765     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
4766     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4767     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4768     __ movdq($tmp$$XMMRegister, $src1$$Register);
4769     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4770     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4771   %}
4772   ins_pipe( pipe_slow );
4773 %}
4774 
4775 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
4776   predicate(UseAVX > 2);
4777   match(Set dst (AddReductionVL src1 src2));
4778   effect(TEMP tmp, TEMP tmp2);
4779   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
4780             "vpaddq  $tmp2,$tmp2,$src2\n\t"
4781             "vextracti128_high  $tmp,$tmp2\n\t"
4782             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4783             "pshufd  $tmp,$tmp2,0xE\n\t"
4784             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4785             "movdq   $tmp,$src1\n\t"
4786             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
4787             "movdq   $dst,$tmp2\t! add reduction8L" %}
4788   ins_encode %{
4789     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
4790     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
4791     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
4792     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4793     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
4794     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4795     __ movdq($tmp$$XMMRegister, $src1$$Register);
4796     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
4797     __ movdq($dst$$Register, $tmp2$$XMMRegister);
4798   %}
4799   ins_pipe( pipe_slow );
4800 %}
4801 #endif
4802 
4803 instruct rsadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
4804   predicate(UseSSE >= 1 && UseAVX == 0);
4805   match(Set dst (AddReductionVF dst src2));
4806   effect(TEMP dst, TEMP tmp);
4807   format %{ "addss   $dst,$src2\n\t"
4808             "pshufd  $tmp,$src2,0x01\n\t"
4809             "addss   $dst,$tmp\t! add reduction2F" %}
4810   ins_encode %{
4811     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
4812     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4813     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4814   %}
4815   ins_pipe( pipe_slow );
4816 %}
4817 
4818 instruct rvadd2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
4819   predicate(UseAVX > 0);
4820   match(Set dst (AddReductionVF dst src2));
4821   effect(TEMP dst, TEMP tmp);
4822   format %{ "vaddss  $dst,$dst,$src2\n\t"
4823             "pshufd  $tmp,$src2,0x01\n\t"
4824             "vaddss  $dst,$dst,$tmp\t! add reduction2F" %}
4825   ins_encode %{
4826     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4827     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4828     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4829   %}
4830   ins_pipe( pipe_slow );
4831 %}
4832 
4833 instruct rsadd4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
4834   predicate(UseSSE >= 1 && UseAVX == 0);
4835   match(Set dst (AddReductionVF dst src2));
4836   effect(TEMP dst, TEMP tmp);
4837   format %{ "addss   $dst,$src2\n\t"
4838             "pshufd  $tmp,$src2,0x01\n\t"
4839             "addss   $dst,$tmp\n\t"
4840             "pshufd  $tmp,$src2,0x02\n\t"
4841             "addss   $dst,$tmp\n\t"
4842             "pshufd  $tmp,$src2,0x03\n\t"
4843             "addss   $dst,$tmp\t! add reduction4F" %}
4844   ins_encode %{
4845     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
4846     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4847     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4848     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4849     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4850     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4851     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
4852   %}
4853   ins_pipe( pipe_slow );
4854 %}
4855 
4856 instruct rvadd4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
4857   predicate(UseAVX > 0);
4858   match(Set dst (AddReductionVF dst src2));
4859   effect(TEMP tmp, TEMP dst);
4860   format %{ "vaddss  $dst,dst,$src2\n\t"
4861             "pshufd  $tmp,$src2,0x01\n\t"
4862             "vaddss  $dst,$dst,$tmp\n\t"
4863             "pshufd  $tmp,$src2,0x02\n\t"
4864             "vaddss  $dst,$dst,$tmp\n\t"
4865             "pshufd  $tmp,$src2,0x03\n\t"
4866             "vaddss  $dst,$dst,$tmp\t! add reduction4F" %}
4867   ins_encode %{
4868     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4869     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4870     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4871     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4872     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4873     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4874     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4875   %}
4876   ins_pipe( pipe_slow );
4877 %}
4878 
4879 instruct radd8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
4880   predicate(UseAVX > 0);
4881   match(Set dst (AddReductionVF dst src2));
4882   effect(TEMP tmp, TEMP dst, TEMP tmp2);
4883   format %{ "vaddss  $dst,$dst,$src2\n\t"
4884             "pshufd  $tmp,$src2,0x01\n\t"
4885             "vaddss  $dst,$dst,$tmp\n\t"
4886             "pshufd  $tmp,$src2,0x02\n\t"
4887             "vaddss  $dst,$dst,$tmp\n\t"
4888             "pshufd  $tmp,$src2,0x03\n\t"
4889             "vaddss  $dst,$dst,$tmp\n\t"
4890             "vextractf128_high  $tmp2,$src2\n\t"
4891             "vaddss  $dst,$dst,$tmp2\n\t"
4892             "pshufd  $tmp,$tmp2,0x01\n\t"
4893             "vaddss  $dst,$dst,$tmp\n\t"
4894             "pshufd  $tmp,$tmp2,0x02\n\t"
4895             "vaddss  $dst,$dst,$tmp\n\t"
4896             "pshufd  $tmp,$tmp2,0x03\n\t"
4897             "vaddss  $dst,$dst,$tmp\t! add reduction8F" %}
4898   ins_encode %{
4899     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4900     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4901     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4902     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4903     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4904     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4905     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4906     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
4907     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4908     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
4909     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4910     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
4911     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4912     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
4913     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4914   %}
4915   ins_pipe( pipe_slow );
4916 %}
4917 
4918 instruct radd16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
4919   predicate(UseAVX > 2);
4920   match(Set dst (AddReductionVF dst src2));
4921   effect(TEMP tmp, TEMP dst, TEMP tmp2);
4922   format %{ "vaddss  $dst,$dst,$src2\n\t"
4923             "pshufd  $tmp,$src2,0x01\n\t"
4924             "vaddss  $dst,$dst,$tmp\n\t"
4925             "pshufd  $tmp,$src2,0x02\n\t"
4926             "vaddss  $dst,$dst,$tmp\n\t"
4927             "pshufd  $tmp,$src2,0x03\n\t"
4928             "vaddss  $dst,$dst,$tmp\n\t"
4929             "vextractf32x4  $tmp2,$src2,0x1\n\t"
4930             "vaddss  $dst,$dst,$tmp2\n\t"
4931             "pshufd  $tmp,$tmp2,0x01\n\t"
4932             "vaddss  $dst,$dst,$tmp\n\t"
4933             "pshufd  $tmp,$tmp2,0x02\n\t"
4934             "vaddss  $dst,$dst,$tmp\n\t"
4935             "pshufd  $tmp,$tmp2,0x03\n\t"
4936             "vaddss  $dst,$dst,$tmp\n\t"
4937             "vextractf32x4  $tmp2,$src2,0x2\n\t"
4938             "vaddss  $dst,$dst,$tmp2\n\t"
4939             "pshufd  $tmp,$tmp2,0x01\n\t"
4940             "vaddss  $dst,$dst,$tmp\n\t"
4941             "pshufd  $tmp,$tmp2,0x02\n\t"
4942             "vaddss  $dst,$dst,$tmp\n\t"
4943             "pshufd  $tmp,$tmp2,0x03\n\t"
4944             "vaddss  $dst,$dst,$tmp\n\t"
4945             "vextractf32x4  $tmp2,$src2,0x3\n\t"
4946             "vaddss  $dst,$dst,$tmp2\n\t"
4947             "pshufd  $tmp,$tmp2,0x01\n\t"
4948             "vaddss  $dst,$dst,$tmp\n\t"
4949             "pshufd  $tmp,$tmp2,0x02\n\t"
4950             "vaddss  $dst,$dst,$tmp\n\t"
4951             "pshufd  $tmp,$tmp2,0x03\n\t"
4952             "vaddss  $dst,$dst,$tmp\t! add reduction16F" %}
4953   ins_encode %{
4954     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
4955     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
4956     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4957     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
4958     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4959     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
4960     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4961     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4962     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4963     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
4964     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4965     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
4966     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4967     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
4968     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4969     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
4970     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4971     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
4972     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4973     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
4974     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4975     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
4976     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4977     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
4978     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
4979     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
4980     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4981     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
4982     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4983     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
4984     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
4985   %}
4986   ins_pipe( pipe_slow );
4987 %}
4988 
4989 instruct rsadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
4990   predicate(UseSSE >= 1 && UseAVX == 0);
4991   match(Set dst (AddReductionVD dst src2));
4992   effect(TEMP tmp, TEMP dst);
4993   format %{ "addsd   $dst,$src2\n\t"
4994             "pshufd  $tmp,$src2,0xE\n\t"
4995             "addsd   $dst,$tmp\t! add reduction2D" %}
4996   ins_encode %{
4997     __ addsd($dst$$XMMRegister, $src2$$XMMRegister);
4998     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
4999     __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
5000   %}
5001   ins_pipe( pipe_slow );
5002 %}
5003 
5004 instruct rvadd2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5005   predicate(UseAVX > 0);
5006   match(Set dst (AddReductionVD dst src2));
5007   effect(TEMP tmp, TEMP dst);
5008   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5009             "pshufd  $tmp,$src2,0xE\n\t"
5010             "vaddsd  $dst,$dst,$tmp\t! add reduction2D" %}
5011   ins_encode %{
5012     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5013     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5014     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5015   %}
5016   ins_pipe( pipe_slow );
5017 %}
5018 
5019 instruct rvadd4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
5020   predicate(UseAVX > 0);
5021   match(Set dst (AddReductionVD dst src2));
5022   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5023   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5024             "pshufd  $tmp,$src2,0xE\n\t"
5025             "vaddsd  $dst,$dst,$tmp\n\t"
5026             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5027             "vaddsd  $dst,$dst,$tmp2\n\t"
5028             "pshufd  $tmp,$tmp2,0xE\n\t"
5029             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
5030   ins_encode %{
5031     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5032     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5033     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5034     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5035     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5036     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5037     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5038   %}
5039   ins_pipe( pipe_slow );
5040 %}
5041 
5042 instruct rvadd8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
5043   predicate(UseAVX > 2);
5044   match(Set dst (AddReductionVD dst src2));
5045   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5046   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5047             "pshufd  $tmp,$src2,0xE\n\t"
5048             "vaddsd  $dst,$dst,$tmp\n\t"
5049             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5050             "vaddsd  $dst,$dst,$tmp2\n\t"
5051             "pshufd  $tmp,$tmp2,0xE\n\t"
5052             "vaddsd  $dst,$dst,$tmp\n\t"
5053             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5054             "vaddsd  $dst,$dst,$tmp2\n\t"
5055             "pshufd  $tmp,$tmp2,0xE\n\t"
5056             "vaddsd  $dst,$dst,$tmp\n\t"
5057             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5058             "vaddsd  $dst,$dst,$tmp2\n\t"
5059             "pshufd  $tmp,$tmp2,0xE\n\t"
5060             "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
5061   ins_encode %{
5062     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5063     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5064     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5065     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5066     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5067     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5068     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5069     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5070     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5071     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5072     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5073     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5074     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5075     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5076     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5077   %}
5078   ins_pipe( pipe_slow );
5079 %}
5080 
5081 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
5082   predicate(UseSSE > 3 && UseAVX == 0);
5083   match(Set dst (MulReductionVI src1 src2));
5084   effect(TEMP tmp, TEMP tmp2);
5085   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
5086             "pmulld  $tmp2,$src2\n\t"
5087             "movd    $tmp,$src1\n\t"
5088             "pmulld  $tmp2,$tmp\n\t"
5089             "movd    $dst,$tmp2\t! mul reduction2I" %}
5090   ins_encode %{
5091     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5092     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5093     __ movdl($tmp$$XMMRegister, $src1$$Register);
5094     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5095     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5096   %}
5097   ins_pipe( pipe_slow );
5098 %}
5099 
5100 instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
5101   predicate(UseAVX > 0);
5102   match(Set dst (MulReductionVI src1 src2));
5103   effect(TEMP tmp, TEMP tmp2);
5104   format %{ "pshufd   $tmp2,$src2,0x1\n\t"
5105             "vpmulld  $tmp,$src2,$tmp2\n\t"
5106             "movd     $tmp2,$src1\n\t"
5107             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5108             "movd     $dst,$tmp2\t! mul reduction2I" %}
5109   ins_encode %{
5110     int vector_len = 0;
5111     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5112     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5113     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5114     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5115     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5116   %}
5117   ins_pipe( pipe_slow );
5118 %}
5119 
5120 instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
5121   predicate(UseSSE > 3 && UseAVX == 0);
5122   match(Set dst (MulReductionVI src1 src2));
5123   effect(TEMP tmp, TEMP tmp2);
5124   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5125             "pmulld  $tmp2,$src2\n\t"
5126             "pshufd  $tmp,$tmp2,0x1\n\t"
5127             "pmulld  $tmp2,$tmp\n\t"
5128             "movd    $tmp,$src1\n\t"
5129             "pmulld  $tmp2,$tmp\n\t"
5130             "movd    $dst,$tmp2\t! mul reduction4I" %}
5131   ins_encode %{
5132     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5133     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5134     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
5135     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5136     __ movdl($tmp$$XMMRegister, $src1$$Register);
5137     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5138     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5139   %}
5140   ins_pipe( pipe_slow );
5141 %}
5142 
5143 instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
5144   predicate(UseAVX > 0);
5145   match(Set dst (MulReductionVI src1 src2));
5146   effect(TEMP tmp, TEMP tmp2);
5147   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5148             "vpmulld  $tmp,$src2,$tmp2\n\t"
5149             "pshufd   $tmp2,$tmp,0x1\n\t"
5150             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5151             "movd     $tmp2,$src1\n\t"
5152             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5153             "movd     $dst,$tmp2\t! mul reduction4I" %}
5154   ins_encode %{
5155     int vector_len = 0;
5156     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5157     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5158     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5159     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5160     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5161     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5162     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5163   %}
5164   ins_pipe( pipe_slow );
5165 %}
5166 
5167 instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
5168   predicate(UseAVX > 0);
5169   match(Set dst (MulReductionVI src1 src2));
5170   effect(TEMP tmp, TEMP tmp2);
5171   format %{ "vextracti128_high  $tmp,$src2\n\t"
5172             "vpmulld  $tmp,$tmp,$src2\n\t"
5173             "pshufd   $tmp2,$tmp,0xE\n\t"
5174             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5175             "pshufd   $tmp2,$tmp,0x1\n\t"
5176             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5177             "movd     $tmp2,$src1\n\t"
5178             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5179             "movd     $dst,$tmp2\t! mul reduction8I" %}
5180   ins_encode %{
5181     int vector_len = 0;
5182     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5183     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5184     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5185     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5186     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5187     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5188     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5189     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5190     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5191   %}
5192   ins_pipe( pipe_slow );
5193 %}
5194 
5195 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, vecZ src2, regF tmp, regF tmp2, regF tmp3) %{
5196   predicate(UseAVX > 2);
5197   match(Set dst (MulReductionVI src1 src2));
5198   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5199   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5200             "vpmulld  $tmp3,$tmp3,$src2\n\t"
5201             "vextracti128_high  $tmp,$tmp3\n\t"
5202             "vpmulld  $tmp,$tmp,$src2\n\t"
5203             "pshufd   $tmp2,$tmp,0xE\n\t"
5204             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5205             "pshufd   $tmp2,$tmp,0x1\n\t"
5206             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5207             "movd     $tmp2,$src1\n\t"
5208             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5209             "movd     $dst,$tmp2\t! mul reduction16I" %}
5210   ins_encode %{
5211     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5212     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5213     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5214     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5215     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5216     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5217     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5218     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5219     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5220     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5221     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5222   %}
5223   ins_pipe( pipe_slow );
5224 %}
5225 
5226 #ifdef _LP64
5227 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, regF tmp, regF tmp2) %{
5228   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5229   match(Set dst (MulReductionVL src1 src2));
5230   effect(TEMP tmp, TEMP tmp2);
5231   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5232             "vpmullq  $tmp,$src2,$tmp2\n\t"
5233             "movdq    $tmp2,$src1\n\t"
5234             "vpmullq  $tmp2,$tmp,$tmp2\n\t"
5235             "movdq    $dst,$tmp2\t! mul reduction2L" %}
5236   ins_encode %{
5237     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5238     __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5239     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5240     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5241     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5242   %}
5243   ins_pipe( pipe_slow );
5244 %}
5245 
5246 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, regF tmp, regF tmp2) %{
5247   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5248   match(Set dst (MulReductionVL src1 src2));
5249   effect(TEMP tmp, TEMP tmp2);
5250   format %{ "vextracti128_high  $tmp,$src2\n\t"
5251             "vpmullq  $tmp2,$tmp,$src2\n\t"
5252             "pshufd   $tmp,$tmp2,0xE\n\t"
5253             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5254             "movdq    $tmp,$src1\n\t"
5255             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5256             "movdq    $dst,$tmp2\t! mul reduction4L" %}
5257   ins_encode %{
5258     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5259     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5260     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5261     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5262     __ movdq($tmp$$XMMRegister, $src1$$Register);
5263     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5264     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5265   %}
5266   ins_pipe( pipe_slow );
5267 %}
5268 
5269 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, vecZ src2, regF tmp, regF tmp2) %{
5270   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5271   match(Set dst (MulReductionVL src1 src2));
5272   effect(TEMP tmp, TEMP tmp2);
5273   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5274             "vpmullq  $tmp2,$tmp2,$src2\n\t"
5275             "vextracti128_high  $tmp,$tmp2\n\t"
5276             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5277             "pshufd   $tmp,$tmp2,0xE\n\t"
5278             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5279             "movdq    $tmp,$src1\n\t"
5280             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5281             "movdq    $dst,$tmp2\t! mul reduction8L" %}
5282   ins_encode %{
5283     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5284     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5285     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5286     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5287     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5288     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5289     __ movdq($tmp$$XMMRegister, $src1$$Register);
5290     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5291     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5292   %}
5293   ins_pipe( pipe_slow );
5294 %}
5295 #endif
5296 
5297 instruct rsmul2F_reduction(regF dst, vecD src2, regF tmp) %{
5298   predicate(UseSSE >= 1 && UseAVX == 0);
5299   match(Set dst (MulReductionVF dst src2));
5300   effect(TEMP dst, TEMP tmp);
5301   format %{ "mulss   $dst,$src2\n\t"
5302             "pshufd  $tmp,$src2,0x01\n\t"
5303             "mulss   $dst,$tmp\t! mul reduction2F" %}
5304   ins_encode %{
5305     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5306     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5307     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5308   %}
5309   ins_pipe( pipe_slow );
5310 %}
5311 
5312 instruct rvmul2F_reduction_reg(regF dst, vecD src2, regF tmp) %{
5313   predicate(UseAVX > 0);
5314   match(Set dst (MulReductionVF dst src2));
5315   effect(TEMP tmp, TEMP dst);
5316   format %{ "vmulss  $dst,$dst,$src2\n\t"
5317             "pshufd  $tmp,$src2,0x01\n\t"
5318             "vmulss  $dst,$dst,$tmp\t! mul reduction2F" %}
5319   ins_encode %{
5320     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5321     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5322     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5323   %}
5324   ins_pipe( pipe_slow );
5325 %}
5326 
5327 instruct rsmul4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
5328   predicate(UseSSE >= 1 && UseAVX == 0);
5329   match(Set dst (MulReductionVF dst src2));
5330   effect(TEMP dst, TEMP tmp);
5331   format %{ "mulss   $dst,$src2\n\t"
5332             "pshufd  $tmp,$src2,0x01\n\t"
5333             "mulss   $dst,$tmp\n\t"
5334             "pshufd  $tmp,$src2,0x02\n\t"
5335             "mulss   $dst,$tmp\n\t"
5336             "pshufd  $tmp,$src2,0x03\n\t"
5337             "mulss   $dst,$tmp\t! mul reduction4F" %}
5338   ins_encode %{
5339     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5340     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5341     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5342     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5343     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5344     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5345     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5346   %}
5347   ins_pipe( pipe_slow );
5348 %}
5349 
5350 instruct rvmul4F_reduction_reg(regF dst, vecX src2, regF tmp) %{
5351   predicate(UseAVX > 0);
5352   match(Set dst (MulReductionVF dst src2));
5353   effect(TEMP tmp, TEMP dst);
5354   format %{ "vmulss  $dst,$dst,$src2\n\t"
5355             "pshufd  $tmp,$src2,0x01\n\t"
5356             "vmulss  $dst,$dst,$tmp\n\t"
5357             "pshufd  $tmp,$src2,0x02\n\t"
5358             "vmulss  $dst,$dst,$tmp\n\t"
5359             "pshufd  $tmp,$src2,0x03\n\t"
5360             "vmulss  $dst,$dst,$tmp\t! mul reduction4F" %}
5361   ins_encode %{
5362     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5363     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5364     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5365     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5366     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5367     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5368     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5369   %}
5370   ins_pipe( pipe_slow );
5371 %}
5372 
5373 instruct rvmul8F_reduction_reg(regF dst, vecY src2, regF tmp, regF tmp2) %{
5374   predicate(UseAVX > 0);
5375   match(Set dst (MulReductionVF dst src2));
5376   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5377   format %{ "vmulss  $dst,$dst,$src2\n\t"
5378             "pshufd  $tmp,$src2,0x01\n\t"
5379             "vmulss  $dst,$dst,$tmp\n\t"
5380             "pshufd  $tmp,$src2,0x02\n\t"
5381             "vmulss  $dst,$dst,$tmp\n\t"
5382             "pshufd  $tmp,$src2,0x03\n\t"
5383             "vmulss  $dst,$dst,$tmp\n\t"
5384             "vextractf128_high  $tmp2,$src2\n\t"
5385             "vmulss  $dst,$dst,$tmp2\n\t"
5386             "pshufd  $tmp,$tmp2,0x01\n\t"
5387             "vmulss  $dst,$dst,$tmp\n\t"
5388             "pshufd  $tmp,$tmp2,0x02\n\t"
5389             "vmulss  $dst,$dst,$tmp\n\t"
5390             "pshufd  $tmp,$tmp2,0x03\n\t"
5391             "vmulss  $dst,$dst,$tmp\t! mul reduction8F" %}
5392   ins_encode %{
5393     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5394     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5395     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5396     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5397     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5398     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5399     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5400     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5401     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5402     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5403     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5404     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5405     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5406     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5407     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5408   %}
5409   ins_pipe( pipe_slow );
5410 %}
5411 
5412 instruct rvmul16F_reduction_reg(regF dst, vecZ src2, regF tmp, regF tmp2) %{
5413   predicate(UseAVX > 2);
5414   match(Set dst (MulReductionVF dst src2));
5415   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5416   format %{ "vmulss  $dst,$dst,$src2\n\t"
5417             "pshufd  $tmp,$src2,0x01\n\t"
5418             "vmulss  $dst,$dst,$tmp\n\t"
5419             "pshufd  $tmp,$src2,0x02\n\t"
5420             "vmulss  $dst,$dst,$tmp\n\t"
5421             "pshufd  $tmp,$src2,0x03\n\t"
5422             "vmulss  $dst,$dst,$tmp\n\t"
5423             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5424             "vmulss  $dst,$dst,$tmp2\n\t"
5425             "pshufd  $tmp,$tmp2,0x01\n\t"
5426             "vmulss  $dst,$dst,$tmp\n\t"
5427             "pshufd  $tmp,$tmp2,0x02\n\t"
5428             "vmulss  $dst,$dst,$tmp\n\t"
5429             "pshufd  $tmp,$tmp2,0x03\n\t"
5430             "vmulss  $dst,$dst,$tmp\n\t"
5431             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5432             "vmulss  $dst,$dst,$tmp2\n\t"
5433             "pshufd  $tmp,$tmp2,0x01\n\t"
5434             "vmulss  $dst,$dst,$tmp\n\t"
5435             "pshufd  $tmp,$tmp2,0x02\n\t"
5436             "vmulss  $dst,$dst,$tmp\n\t"
5437             "pshufd  $tmp,$tmp2,0x03\n\t"
5438             "vmulss  $dst,$dst,$tmp\n\t"
5439             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5440             "vmulss  $dst,$dst,$tmp2\n\t"
5441             "pshufd  $tmp,$tmp2,0x01\n\t"
5442             "vmulss  $dst,$dst,$tmp\n\t"
5443             "pshufd  $tmp,$tmp2,0x02\n\t"
5444             "vmulss  $dst,$dst,$tmp\n\t"
5445             "pshufd  $tmp,$tmp2,0x03\n\t"
5446             "vmulss  $dst,$dst,$tmp\t! mul reduction16F" %}
5447   ins_encode %{
5448     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5449     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5450     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5451     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5452     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5453     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5454     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5455     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5456     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5457     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5458     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5459     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5460     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5461     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5462     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5463     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5464     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5465     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5466     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5467     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5468     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5469     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5470     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5471     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5472     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5473     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5474     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5475     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5476     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5477     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5478     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5479   %}
5480   ins_pipe( pipe_slow );
5481 %}
5482 
5483 instruct rsmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5484   predicate(UseSSE >= 1 && UseAVX == 0);
5485   match(Set dst (MulReductionVD dst src2));
5486   effect(TEMP dst, TEMP tmp);
5487   format %{ "mulsd   $dst,$src2\n\t"
5488             "pshufd  $tmp,$src2,0xE\n\t"
5489             "mulsd   $dst,$tmp\t! mul reduction2D" %}
5490   ins_encode %{
5491     __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
5492     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5493     __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
5494   %}
5495   ins_pipe( pipe_slow );
5496 %}
5497 
5498 instruct rvmul2D_reduction_reg(regD dst, vecX src2, regD tmp) %{
5499   predicate(UseAVX > 0);
5500   match(Set dst (MulReductionVD dst src2));
5501   effect(TEMP tmp, TEMP dst);
5502   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5503             "pshufd  $tmp,$src2,0xE\n\t"
5504             "vmulsd  $dst,$dst,$tmp\t! mul reduction2D" %}
5505   ins_encode %{
5506     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5507     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5508     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5509   %}
5510   ins_pipe( pipe_slow );
5511 %}
5512 
5513 instruct rvmul4D_reduction_reg(regD dst, vecY src2, regD tmp, regD tmp2) %{
5514   predicate(UseAVX > 0);
5515   match(Set dst (MulReductionVD dst src2));
5516   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5517   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5518             "pshufd  $tmp,$src2,0xE\n\t"
5519             "vmulsd  $dst,$dst,$tmp\n\t"
5520             "vextractf128_high  $tmp2,$src2\n\t"
5521             "vmulsd  $dst,$dst,$tmp2\n\t"
5522             "pshufd  $tmp,$tmp2,0xE\n\t"
5523             "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
5524   ins_encode %{
5525     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5526     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5527     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5528     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5529     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5530     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5531     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5532   %}
5533   ins_pipe( pipe_slow );
5534 %}
5535 
5536 instruct rvmul8D_reduction_reg(regD dst, vecZ src2, regD tmp, regD tmp2) %{
5537   predicate(UseAVX > 2);
5538   match(Set dst (MulReductionVD dst src2));
5539   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5540   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5541             "pshufd  $tmp,$src2,0xE\n\t"
5542             "vmulsd  $dst,$dst,$tmp\n\t"
5543             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5544             "vmulsd  $dst,$dst,$tmp2\n\t"
5545             "pshufd  $tmp,$src2,0xE\n\t"
5546             "vmulsd  $dst,$dst,$tmp\n\t"
5547             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5548             "vmulsd  $dst,$dst,$tmp2\n\t"
5549             "pshufd  $tmp,$tmp2,0xE\n\t"
5550             "vmulsd  $dst,$dst,$tmp\n\t"
5551             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5552             "vmulsd  $dst,$dst,$tmp2\n\t"
5553             "pshufd  $tmp,$tmp2,0xE\n\t"
5554             "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
5555   ins_encode %{
5556     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5557     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5558     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5559     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5560     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5561     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5562     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5563     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5564     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5565     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5566     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5567     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5568     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5569     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5570     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5571   %}
5572   ins_pipe( pipe_slow );
5573 %}
5574 
5575 // ====================VECTOR ARITHMETIC=======================================
5576 
5577 // --------------------------------- ADD --------------------------------------
5578 
5579 // Bytes vector add
5580 instruct vadd4B(vecS dst, vecS src) %{
5581   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
5582   match(Set dst (AddVB dst src));
5583   format %{ "paddb   $dst,$src\t! add packed4B" %}
5584   ins_encode %{
5585     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5586   %}
5587   ins_pipe( pipe_slow );
5588 %}
5589 
5590 instruct vadd4B_reg_avx(vecS dst, vecS src1, vecS src2) %{
5591   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
5592   match(Set dst (AddVB src1 src2));
5593   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
5594   ins_encode %{
5595     int vector_len = 0;
5596     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5597   %}
5598   ins_pipe( pipe_slow );
5599 %}
5600 
5601 instruct vadd4B_reg_evex(vecS dst, vecS src1, vecS src2) %{
5602   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
5603   match(Set dst (AddVB src1 src2));
5604   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
5605   ins_encode %{
5606     int vector_len = 0;
5607     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5608   %}
5609   ins_pipe( pipe_slow );
5610 %}
5611 
5612 instruct vadd4B_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
5613   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
5614   match(Set dst (AddVB dst src2));
5615   effect(TEMP src1);
5616   format %{ "vpaddb  $dst,$dst,$src2\t! add packed4B" %}
5617   ins_encode %{
5618     int vector_len = 0;
5619     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5620   %}
5621   ins_pipe( pipe_slow );
5622 %}
5623 
5624 instruct vadd4B_mem_avx(vecS dst, vecS src, memory mem) %{
5625   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
5626   match(Set dst (AddVB src (LoadVector mem)));
5627   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5628   ins_encode %{
5629     int vector_len = 0;
5630     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5631   %}
5632   ins_pipe( pipe_slow );
5633 %}
5634 
5635 instruct vadd4B_mem_evex(vecS dst, vecS src, memory mem) %{
5636   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
5637   match(Set dst (AddVB src (LoadVector mem)));
5638   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5639   ins_encode %{
5640     int vector_len = 0;
5641     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5642   %}
5643   ins_pipe( pipe_slow );
5644 %}
5645 
5646 instruct vadd4B_mem_evex_special(vecS dst, vecS src, memory mem) %{
5647   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
5648   match(Set dst (AddVB dst (LoadVector mem)));
5649   effect(TEMP src);
5650   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5651   ins_encode %{
5652     int vector_len = 0;
5653     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5654   %}
5655   ins_pipe( pipe_slow );
5656 %}
5657 
5658 instruct vadd8B(vecD dst, vecD src) %{
5659   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
5660   match(Set dst (AddVB dst src));
5661   format %{ "paddb   $dst,$src\t! add packed8B" %}
5662   ins_encode %{
5663     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5664   %}
5665   ins_pipe( pipe_slow );
5666 %}
5667 
5668 instruct vadd8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
5669   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
5670   match(Set dst (AddVB src1 src2));
5671   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
5672   ins_encode %{
5673     int vector_len = 0;
5674     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5675   %}
5676   ins_pipe( pipe_slow );
5677 %}
5678 
5679 instruct vadd8B_reg_evex(vecD dst, vecD src1, vecD src2) %{
5680   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
5681   match(Set dst (AddVB src1 src2));
5682   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
5683   ins_encode %{
5684     int vector_len = 0;
5685     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5686   %}
5687   ins_pipe( pipe_slow );
5688 %}
5689 
5690 instruct vadd8B_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
5691   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
5692   match(Set dst (AddVB dst src2));
5693   effect(TEMP src1);
5694   format %{ "vpaddb  $dst,$dst,$src2\t! add packed8B" %}
5695   ins_encode %{
5696     int vector_len = 0;
5697     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5698   %}
5699   ins_pipe( pipe_slow );
5700 %}
5701 
5702 instruct vadd8B_mem_avx(vecD dst, vecD src, memory mem) %{
5703   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
5704   match(Set dst (AddVB src (LoadVector mem)));
5705   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5706   ins_encode %{
5707     int vector_len = 0;
5708     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5709   %}
5710   ins_pipe( pipe_slow );
5711 %}
5712 
5713 instruct vadd8B_mem_evex(vecD dst, vecD src, memory mem) %{
5714   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
5715   match(Set dst (AddVB src (LoadVector mem)));
5716   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5717   ins_encode %{
5718     int vector_len = 0;
5719     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5720   %}
5721   ins_pipe( pipe_slow );
5722 %}
5723 
5724 instruct vadd8B_mem_evex_special(vecD dst, vecD src, memory mem) %{
5725   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
5726   match(Set dst (AddVB dst (LoadVector mem)));
5727   effect(TEMP src);
5728   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
5729   ins_encode %{
5730     int vector_len = 0;
5731     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5732   %}
5733   ins_pipe( pipe_slow );
5734 %}
5735 
5736 instruct vadd16B(vecX dst, vecX src) %{
5737   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
5738   match(Set dst (AddVB dst src));
5739   format %{ "paddb   $dst,$src\t! add packed16B" %}
5740   ins_encode %{
5741     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5742   %}
5743   ins_pipe( pipe_slow );
5744 %}
5745 
5746 instruct vadd16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
5747   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
5748   match(Set dst (AddVB src1 src2));
5749   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
5750   ins_encode %{
5751     int vector_len = 0;
5752     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5753   %}
5754   ins_pipe( pipe_slow );
5755 %}
5756 
5757 instruct vadd16B_reg_evex(vecX dst, vecX src1, vecX src2) %{
5758   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
5759   match(Set dst (AddVB src1 src2));
5760   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
5761   ins_encode %{
5762     int vector_len = 0;
5763     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5764   %}
5765   ins_pipe( pipe_slow );
5766 %}
5767 
5768 instruct vadd16B_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
5769   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
5770   match(Set dst (AddVB dst src2));
5771   effect(TEMP src1);
5772   format %{ "vpaddb  $dst,$dst,$src2\t! add packed16B" %}
5773   ins_encode %{
5774     int vector_len = 0;
5775     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5776   %}
5777   ins_pipe( pipe_slow );
5778 %}
5779 
5780 instruct vadd16B_mem_avx(vecX dst, vecX src, memory mem) %{
5781   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
5782   match(Set dst (AddVB src (LoadVector mem)));
5783   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5784   ins_encode %{
5785     int vector_len = 0;
5786     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5787   %}
5788   ins_pipe( pipe_slow );
5789 %}
5790 
5791 instruct vadd16B_mem_evex(vecX dst, vecX src, memory mem) %{
5792   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
5793   match(Set dst (AddVB src (LoadVector mem)));
5794   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5795   ins_encode %{
5796     int vector_len = 0;
5797     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5798   %}
5799   ins_pipe( pipe_slow );
5800 %}
5801 
5802 instruct vadd16B_mem_evex_special(vecX dst, vecX src, memory mem) %{
5803   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
5804   match(Set dst (AddVB dst (LoadVector mem)));
5805   effect(TEMP src);
5806   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
5807   ins_encode %{
5808     int vector_len = 0;
5809     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5810   %}
5811   ins_pipe( pipe_slow );
5812 %}
5813 
5814 instruct vadd32B_reg_avx(vecY dst, vecY src1, vecY src2) %{
5815   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
5816   match(Set dst (AddVB src1 src2));
5817   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
5818   ins_encode %{
5819     int vector_len = 1;
5820     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5821   %}
5822   ins_pipe( pipe_slow );
5823 %}
5824 
5825 instruct vadd32B_reg_evex(vecY dst, vecY src1, vecY src2) %{
5826   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
5827   match(Set dst (AddVB src1 src2));
5828   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
5829   ins_encode %{
5830     int vector_len = 1;
5831     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5832   %}
5833   ins_pipe( pipe_slow );
5834 %}
5835 
5836 instruct vadd32B_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
5837   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
5838   match(Set dst (AddVB dst src2));
5839   effect(TEMP src1);
5840   format %{ "vpaddb  $dst,$dst,$src2\t! add packed32B" %}
5841   ins_encode %{
5842     int vector_len = 1;
5843     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5844   %}
5845   ins_pipe( pipe_slow );
5846 %}
5847 
5848 instruct vadd32B_mem_avx(vecY dst, vecY src, memory mem) %{
5849   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
5850   match(Set dst (AddVB src (LoadVector mem)));
5851   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
5852   ins_encode %{
5853     int vector_len = 1;
5854     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5855   %}
5856   ins_pipe( pipe_slow );
5857 %}
5858 
5859 instruct vadd32B_mem_evex(vecY dst, vecY src, memory mem) %{
5860   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
5861   match(Set dst (AddVB src (LoadVector mem)));
5862   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
5863   ins_encode %{
5864     int vector_len = 1;
5865     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5866   %}
5867   ins_pipe( pipe_slow );
5868 %}
5869 
5870 instruct vadd32B_mem_evex_special(vecY dst, vecY src, memory mem) %{
5871   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
5872   match(Set dst (AddVB dst (LoadVector mem)));
5873   effect(TEMP src);
5874   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
5875   ins_encode %{
5876     int vector_len = 1;
5877     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5878   %}
5879   ins_pipe( pipe_slow );
5880 %}
5881 
5882 instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
5883   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
5884   match(Set dst (AddVB src1 src2));
5885   format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
5886   ins_encode %{
5887     int vector_len = 2;
5888     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5889   %}
5890   ins_pipe( pipe_slow );
5891 %}
5892 
5893 instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
5894   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
5895   match(Set dst (AddVB src (LoadVector mem)));
5896   format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
5897   ins_encode %{
5898     int vector_len = 2;
5899     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5900   %}
5901   ins_pipe( pipe_slow );
5902 %}
5903 
5904 // Shorts/Chars vector add
5905 instruct vadd2S(vecS dst, vecS src) %{
5906   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
5907   match(Set dst (AddVS dst src));
5908   format %{ "paddw   $dst,$src\t! add packed2S" %}
5909   ins_encode %{
5910     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5911   %}
5912   ins_pipe( pipe_slow );
5913 %}
5914 
5915 instruct vadd2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
5916   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
5917   match(Set dst (AddVS src1 src2));
5918   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
5919   ins_encode %{
5920     int vector_len = 0;
5921     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5922   %}
5923   ins_pipe( pipe_slow );
5924 %}
5925 
5926 instruct vadd2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
5927   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
5928   match(Set dst (AddVS src1 src2));
5929   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
5930   ins_encode %{
5931     int vector_len = 0;
5932     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5933   %}
5934   ins_pipe( pipe_slow );
5935 %}
5936 
5937 instruct vadd2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
5938   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
5939   match(Set dst (AddVS dst src2));
5940   effect(TEMP src1);
5941   format %{ "vpaddw  $dst,$dst,$src2\t! add packed2S" %}
5942   ins_encode %{
5943     int vector_len = 0;
5944     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5945   %}
5946   ins_pipe( pipe_slow );
5947 %}
5948 
5949 instruct vadd2S_mem_avx(vecS dst, vecS src, memory mem) %{
5950   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
5951   match(Set dst (AddVS src (LoadVector mem)));
5952   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
5953   ins_encode %{
5954     int vector_len = 0;
5955     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5956   %}
5957   ins_pipe( pipe_slow );
5958 %}
5959 
5960 instruct vadd2S_mem_evex(vecS dst, vecS src, memory mem) %{
5961   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
5962   match(Set dst (AddVS src (LoadVector mem)));
5963   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
5964   ins_encode %{
5965     int vector_len = 0;
5966     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5967   %}
5968   ins_pipe( pipe_slow );
5969 %}
5970 
5971 instruct vadd2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
5972   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
5973   match(Set dst (AddVS dst (LoadVector mem)));
5974   effect(TEMP src);
5975   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
5976   ins_encode %{
5977     int vector_len = 0;
5978     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5979   %}
5980   ins_pipe( pipe_slow );
5981 %}
5982 
5983 instruct vadd4S(vecD dst, vecD src) %{
5984   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
5985   match(Set dst (AddVS dst src));
5986   format %{ "paddw   $dst,$src\t! add packed4S" %}
5987   ins_encode %{
5988     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5989   %}
5990   ins_pipe( pipe_slow );
5991 %}
5992 
5993 instruct vadd4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
5994   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
5995   match(Set dst (AddVS src1 src2));
5996   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
5997   ins_encode %{
5998     int vector_len = 0;
5999     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6000   %}
6001   ins_pipe( pipe_slow );
6002 %}
6003 
6004 instruct vadd4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
6005   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6006   match(Set dst (AddVS src1 src2));
6007   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
6008   ins_encode %{
6009     int vector_len = 0;
6010     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6011   %}
6012   ins_pipe( pipe_slow );
6013 %}
6014 
6015 instruct vadd4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
6016   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6017   match(Set dst (AddVS dst src2));
6018   effect(TEMP src1);
6019   format %{ "vpaddw  $dst,$dst,$src2\t! add packed4S" %}
6020   ins_encode %{
6021     int vector_len = 0;
6022     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6023   %}
6024   ins_pipe( pipe_slow );
6025 %}
6026 
6027 instruct vadd4S_mem_avx(vecD dst, vecD src, memory mem) %{
6028   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6029   match(Set dst (AddVS src (LoadVector mem)));
6030   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
6031   ins_encode %{
6032     int vector_len = 0;
6033     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6034   %}
6035   ins_pipe( pipe_slow );
6036 %}
6037 
6038 instruct vadd4S_mem_evex(vecD dst, vecD src, memory mem) %{
6039   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6040   match(Set dst (AddVS src (LoadVector mem)));
6041   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
6042   ins_encode %{
6043     int vector_len = 0;
6044     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6045   %}
6046   ins_pipe( pipe_slow );
6047 %}
6048 
6049 instruct vadd4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
6050   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6051   match(Set dst (AddVS dst (LoadVector mem)));
6052   effect(TEMP src);
6053   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
6054   ins_encode %{
6055     int vector_len = 0;
6056     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6057   %}
6058   ins_pipe( pipe_slow );
6059 %}
6060 
6061 instruct vadd8S(vecX dst, vecX src) %{
6062   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6063   match(Set dst (AddVS dst src));
6064   format %{ "paddw   $dst,$src\t! add packed8S" %}
6065   ins_encode %{
6066     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6067   %}
6068   ins_pipe( pipe_slow );
6069 %}
6070 
6071 instruct vadd8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
6072   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6073   match(Set dst (AddVS src1 src2));
6074   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
6075   ins_encode %{
6076     int vector_len = 0;
6077     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6078   %}
6079   ins_pipe( pipe_slow );
6080 %}
6081 
6082 instruct vadd8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
6083   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6084   match(Set dst (AddVS src1 src2));
6085   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
6086   ins_encode %{
6087     int vector_len = 0;
6088     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6089   %}
6090   ins_pipe( pipe_slow );
6091 %}
6092 
6093 instruct vadd8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
6094   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6095   match(Set dst (AddVS dst src2));
6096   effect(TEMP src1);
6097   format %{ "vpaddw  $dst,$dst,$src2\t! add packed8S" %}
6098   ins_encode %{
6099     int vector_len = 0;
6100     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6101   %}
6102   ins_pipe( pipe_slow );
6103 %}
6104 
6105 instruct vadd8S_mem_avx(vecX dst, vecX src, memory mem) %{
6106   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6107   match(Set dst (AddVS src (LoadVector mem)));
6108   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
6109   ins_encode %{
6110     int vector_len = 0;
6111     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6112   %}
6113   ins_pipe( pipe_slow );
6114 %}
6115 
6116 instruct vadd8S_mem_evex(vecX dst, vecX src, memory mem) %{
6117   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6118   match(Set dst (AddVS src (LoadVector mem)));
6119   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
6120   ins_encode %{
6121     int vector_len = 0;
6122     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6123   %}
6124   ins_pipe( pipe_slow );
6125 %}
6126 
6127 instruct vadd8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
6128   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6129   match(Set dst (AddVS dst (LoadVector mem)));
6130   effect(TEMP src);
6131   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
6132   ins_encode %{
6133     int vector_len = 0;
6134     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6135   %}
6136   ins_pipe( pipe_slow );
6137 %}
6138 
6139 instruct vadd16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
6140   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
6141   match(Set dst (AddVS src1 src2));
6142   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
6143   ins_encode %{
6144     int vector_len = 1;
6145     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6146   %}
6147   ins_pipe( pipe_slow );
6148 %}
6149 
6150 instruct vadd16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
6151   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6152   match(Set dst (AddVS src1 src2));
6153   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
6154   ins_encode %{
6155     int vector_len = 1;
6156     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6157   %}
6158   ins_pipe( pipe_slow );
6159 %}
6160 
6161 instruct vadd16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
6162   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6163   match(Set dst (AddVS dst src2));
6164   effect(TEMP src1);
6165   format %{ "vpaddw  $dst,$dst,$src2\t! add packed16S" %}
6166   ins_encode %{
6167     int vector_len = 1;
6168     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6169   %}
6170   ins_pipe( pipe_slow );
6171 %}
6172 
6173 instruct vadd16S_mem_avx(vecY dst, vecY src, memory mem) %{
6174   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
6175   match(Set dst (AddVS src (LoadVector mem)));
6176   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6177   ins_encode %{
6178     int vector_len = 1;
6179     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6180   %}
6181   ins_pipe( pipe_slow );
6182 %}
6183 
6184 instruct vadd16S_mem_evex(vecY dst, vecY src, memory mem) %{
6185   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6186   match(Set dst (AddVS src (LoadVector mem)));
6187   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6188   ins_encode %{
6189     int vector_len = 1;
6190     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6191   %}
6192   ins_pipe( pipe_slow );
6193 %}
6194 
6195 instruct vadd16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
6196   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6197   match(Set dst (AddVS dst (LoadVector mem)));
6198   effect(TEMP src);
6199   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6200   ins_encode %{
6201     int vector_len = 1;
6202     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6203   %}
6204   ins_pipe( pipe_slow );
6205 %}
6206 
6207 instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
6208   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6209   match(Set dst (AddVS src1 src2));
6210   format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
6211   ins_encode %{
6212     int vector_len = 2;
6213     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6214   %}
6215   ins_pipe( pipe_slow );
6216 %}
6217 
6218 instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
6219   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6220   match(Set dst (AddVS src (LoadVector mem)));
6221   format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
6222   ins_encode %{
6223     int vector_len = 2;
6224     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6225   %}
6226   ins_pipe( pipe_slow );
6227 %}
6228 
6229 // Integers vector add
6230 instruct vadd2I(vecD dst, vecD src) %{
6231   predicate(n->as_Vector()->length() == 2);
6232   match(Set dst (AddVI dst src));
6233   format %{ "paddd   $dst,$src\t! add packed2I" %}
6234   ins_encode %{
6235     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6236   %}
6237   ins_pipe( pipe_slow );
6238 %}
6239 
6240 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
6241   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6242   match(Set dst (AddVI src1 src2));
6243   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
6244   ins_encode %{
6245     int vector_len = 0;
6246     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6247   %}
6248   ins_pipe( pipe_slow );
6249 %}
6250 
6251 instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
6252   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6253   match(Set dst (AddVI src (LoadVector mem)));
6254   format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
6255   ins_encode %{
6256     int vector_len = 0;
6257     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6258   %}
6259   ins_pipe( pipe_slow );
6260 %}
6261 
6262 instruct vadd4I(vecX dst, vecX src) %{
6263   predicate(n->as_Vector()->length() == 4);
6264   match(Set dst (AddVI dst src));
6265   format %{ "paddd   $dst,$src\t! add packed4I" %}
6266   ins_encode %{
6267     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6268   %}
6269   ins_pipe( pipe_slow );
6270 %}
6271 
6272 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
6273   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6274   match(Set dst (AddVI src1 src2));
6275   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
6276   ins_encode %{
6277     int vector_len = 0;
6278     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6279   %}
6280   ins_pipe( pipe_slow );
6281 %}
6282 
6283 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
6284   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6285   match(Set dst (AddVI src (LoadVector mem)));
6286   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
6287   ins_encode %{
6288     int vector_len = 0;
6289     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6290   %}
6291   ins_pipe( pipe_slow );
6292 %}
6293 
6294 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
6295   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6296   match(Set dst (AddVI src1 src2));
6297   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
6298   ins_encode %{
6299     int vector_len = 1;
6300     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6301   %}
6302   ins_pipe( pipe_slow );
6303 %}
6304 
6305 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
6306   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6307   match(Set dst (AddVI src (LoadVector mem)));
6308   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
6309   ins_encode %{
6310     int vector_len = 1;
6311     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6312   %}
6313   ins_pipe( pipe_slow );
6314 %}
6315 
6316 instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
6317   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6318   match(Set dst (AddVI src1 src2));
6319   format %{ "vpaddd  $dst,$src1,$src2\t! add packed16I" %}
6320   ins_encode %{
6321     int vector_len = 2;
6322     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6323   %}
6324   ins_pipe( pipe_slow );
6325 %}
6326 
6327 instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{
6328   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6329   match(Set dst (AddVI src (LoadVector mem)));
6330   format %{ "vpaddd  $dst,$src,$mem\t! add packed16I" %}
6331   ins_encode %{
6332     int vector_len = 2;
6333     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6334   %}
6335   ins_pipe( pipe_slow );
6336 %}
6337 
6338 // Longs vector add
6339 instruct vadd2L(vecX dst, vecX src) %{
6340   predicate(n->as_Vector()->length() == 2);
6341   match(Set dst (AddVL dst src));
6342   format %{ "paddq   $dst,$src\t! add packed2L" %}
6343   ins_encode %{
6344     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
6345   %}
6346   ins_pipe( pipe_slow );
6347 %}
6348 
6349 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
6350   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6351   match(Set dst (AddVL src1 src2));
6352   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
6353   ins_encode %{
6354     int vector_len = 0;
6355     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6356   %}
6357   ins_pipe( pipe_slow );
6358 %}
6359 
6360 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
6361   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6362   match(Set dst (AddVL src (LoadVector mem)));
6363   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
6364   ins_encode %{
6365     int vector_len = 0;
6366     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6367   %}
6368   ins_pipe( pipe_slow );
6369 %}
6370 
6371 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
6372   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6373   match(Set dst (AddVL src1 src2));
6374   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
6375   ins_encode %{
6376     int vector_len = 1;
6377     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6378   %}
6379   ins_pipe( pipe_slow );
6380 %}
6381 
6382 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
6383   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6384   match(Set dst (AddVL src (LoadVector mem)));
6385   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
6386   ins_encode %{
6387     int vector_len = 1;
6388     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6389   %}
6390   ins_pipe( pipe_slow );
6391 %}
6392 
6393 instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
6394   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6395   match(Set dst (AddVL src1 src2));
6396   format %{ "vpaddq  $dst,$src1,$src2\t! add packed8L" %}
6397   ins_encode %{
6398     int vector_len = 2;
6399     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6400   %}
6401   ins_pipe( pipe_slow );
6402 %}
6403 
6404 instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{
6405   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6406   match(Set dst (AddVL src (LoadVector mem)));
6407   format %{ "vpaddq  $dst,$src,$mem\t! add packed8L" %}
6408   ins_encode %{
6409     int vector_len = 2;
6410     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6411   %}
6412   ins_pipe( pipe_slow );
6413 %}
6414 
6415 // Floats vector add
6416 instruct vadd2F(vecD dst, vecD src) %{
6417   predicate(n->as_Vector()->length() == 2);
6418   match(Set dst (AddVF dst src));
6419   format %{ "addps   $dst,$src\t! add packed2F" %}
6420   ins_encode %{
6421     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6422   %}
6423   ins_pipe( pipe_slow );
6424 %}
6425 
6426 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
6427   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6428   match(Set dst (AddVF src1 src2));
6429   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
6430   ins_encode %{
6431     int vector_len = 0;
6432     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6433   %}
6434   ins_pipe( pipe_slow );
6435 %}
6436 
6437 instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
6438   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6439   match(Set dst (AddVF src (LoadVector mem)));
6440   format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
6441   ins_encode %{
6442     int vector_len = 0;
6443     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6444   %}
6445   ins_pipe( pipe_slow );
6446 %}
6447 
6448 instruct vadd4F(vecX dst, vecX src) %{
6449   predicate(n->as_Vector()->length() == 4);
6450   match(Set dst (AddVF dst src));
6451   format %{ "addps   $dst,$src\t! add packed4F" %}
6452   ins_encode %{
6453     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6454   %}
6455   ins_pipe( pipe_slow );
6456 %}
6457 
6458 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
6459   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6460   match(Set dst (AddVF src1 src2));
6461   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
6462   ins_encode %{
6463     int vector_len = 0;
6464     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6465   %}
6466   ins_pipe( pipe_slow );
6467 %}
6468 
6469 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
6470   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6471   match(Set dst (AddVF src (LoadVector mem)));
6472   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
6473   ins_encode %{
6474     int vector_len = 0;
6475     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6476   %}
6477   ins_pipe( pipe_slow );
6478 %}
6479 
6480 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
6481   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6482   match(Set dst (AddVF src1 src2));
6483   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
6484   ins_encode %{
6485     int vector_len = 1;
6486     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6487   %}
6488   ins_pipe( pipe_slow );
6489 %}
6490 
6491 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
6492   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6493   match(Set dst (AddVF src (LoadVector mem)));
6494   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
6495   ins_encode %{
6496     int vector_len = 1;
6497     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6498   %}
6499   ins_pipe( pipe_slow );
6500 %}
6501 
6502 instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
6503   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6504   match(Set dst (AddVF src1 src2));
6505   format %{ "vaddps  $dst,$src1,$src2\t! add packed16F" %}
6506   ins_encode %{
6507     int vector_len = 2;
6508     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6509   %}
6510   ins_pipe( pipe_slow );
6511 %}
6512 
6513 instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{
6514   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6515   match(Set dst (AddVF src (LoadVector mem)));
6516   format %{ "vaddps  $dst,$src,$mem\t! add packed16F" %}
6517   ins_encode %{
6518     int vector_len = 2;
6519     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6520   %}
6521   ins_pipe( pipe_slow );
6522 %}
6523 
6524 // Doubles vector add
6525 instruct vadd2D(vecX dst, vecX src) %{
6526   predicate(n->as_Vector()->length() == 2);
6527   match(Set dst (AddVD dst src));
6528   format %{ "addpd   $dst,$src\t! add packed2D" %}
6529   ins_encode %{
6530     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
6531   %}
6532   ins_pipe( pipe_slow );
6533 %}
6534 
6535 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
6536   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6537   match(Set dst (AddVD src1 src2));
6538   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
6539   ins_encode %{
6540     int vector_len = 0;
6541     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6542   %}
6543   ins_pipe( pipe_slow );
6544 %}
6545 
6546 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
6547   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6548   match(Set dst (AddVD src (LoadVector mem)));
6549   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
6550   ins_encode %{
6551     int vector_len = 0;
6552     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6553   %}
6554   ins_pipe( pipe_slow );
6555 %}
6556 
6557 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
6558   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6559   match(Set dst (AddVD src1 src2));
6560   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
6561   ins_encode %{
6562     int vector_len = 1;
6563     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6564   %}
6565   ins_pipe( pipe_slow );
6566 %}
6567 
6568 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
6569   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6570   match(Set dst (AddVD src (LoadVector mem)));
6571   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
6572   ins_encode %{
6573     int vector_len = 1;
6574     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6575   %}
6576   ins_pipe( pipe_slow );
6577 %}
6578 
6579 instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
6580   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6581   match(Set dst (AddVD src1 src2));
6582   format %{ "vaddpd  $dst,$src1,$src2\t! add packed8D" %}
6583   ins_encode %{
6584     int vector_len = 2;
6585     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6586   %}
6587   ins_pipe( pipe_slow );
6588 %}
6589 
6590 instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{
6591   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6592   match(Set dst (AddVD src (LoadVector mem)));
6593   format %{ "vaddpd  $dst,$src,$mem\t! add packed8D" %}
6594   ins_encode %{
6595     int vector_len = 2;
6596     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6597   %}
6598   ins_pipe( pipe_slow );
6599 %}
6600 
6601 // --------------------------------- SUB --------------------------------------
6602 
6603 // Bytes vector sub
6604 instruct vsub4B(vecS dst, vecS src) %{
6605   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6606   match(Set dst (SubVB dst src));
6607   format %{ "psubb   $dst,$src\t! sub packed4B" %}
6608   ins_encode %{
6609     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6610   %}
6611   ins_pipe( pipe_slow );
6612 %}
6613 
6614 instruct vsub4B_reg_avx(vecS dst, vecS src1, vecS src2) %{
6615   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6616   match(Set dst (SubVB src1 src2));
6617   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6618   ins_encode %{
6619     int vector_len = 0;
6620     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6621   %}
6622   ins_pipe( pipe_slow );
6623 %}
6624 
6625 instruct vsub4B_reg_evex(vecS dst, vecS src1, vecS src2) %{
6626   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6627   match(Set dst (SubVB src1 src2));
6628   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6629   ins_encode %{
6630     int vector_len = 0;
6631     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6632   %}
6633   ins_pipe( pipe_slow );
6634 %}
6635 
6636 instruct vsub4B_reg_exex_special(vecS dst, vecS src1, vecS src2) %{
6637   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6638   match(Set dst (SubVB dst src2));
6639   effect(TEMP src1);
6640   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6641   ins_encode %{
6642     int vector_len = 0;
6643     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6644   %}
6645   ins_pipe( pipe_slow );
6646 %}
6647 
6648 instruct vsub4B_mem_avx(vecS dst, vecS src, memory mem) %{
6649   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
6650   match(Set dst (SubVB src (LoadVector mem)));
6651   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6652   ins_encode %{
6653     int vector_len = 0;
6654     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6655   %}
6656   ins_pipe( pipe_slow );
6657 %}
6658 
6659 instruct vsub4B_mem_evex(vecS dst, vecS src, memory mem) %{
6660   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
6661   match(Set dst (SubVB src (LoadVector mem)));
6662   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6663   ins_encode %{
6664     int vector_len = 0;
6665     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6666   %}
6667   ins_pipe( pipe_slow );
6668 %}
6669 
6670 instruct vsub4B_mem_evex_special(vecS dst, vecS src, memory mem) %{
6671   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
6672   match(Set dst (SubVB dst (LoadVector mem)));
6673   effect(TEMP src);
6674   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6675   ins_encode %{
6676     int vector_len = 0;
6677     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6678   %}
6679   ins_pipe( pipe_slow );
6680 %}
6681 
6682 instruct vsub8B(vecD dst, vecD src) %{
6683   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6684   match(Set dst (SubVB dst src));
6685   format %{ "psubb   $dst,$src\t! sub packed8B" %}
6686   ins_encode %{
6687     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6688   %}
6689   ins_pipe( pipe_slow );
6690 %}
6691 
6692 instruct vsub8B_reg_avx(vecD dst, vecD src1, vecD src2) %{
6693   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6694   match(Set dst (SubVB src1 src2));
6695   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6696   ins_encode %{
6697     int vector_len = 0;
6698     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6699   %}
6700   ins_pipe( pipe_slow );
6701 %}
6702 
6703 instruct vsub8B_reg_evex(vecD dst, vecD src1, vecD src2) %{
6704   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6705   match(Set dst (SubVB src1 src2));
6706   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6707   ins_encode %{
6708     int vector_len = 0;
6709     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6710   %}
6711   ins_pipe( pipe_slow );
6712 %}
6713 
6714 instruct vsub8B_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
6715   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6716   match(Set dst (SubVB dst src2));
6717   effect(TEMP src1);
6718   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6719   ins_encode %{
6720     int vector_len = 0;
6721     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6722   %}
6723   ins_pipe( pipe_slow );
6724 %}
6725 
6726 instruct vsub8B_mem_avx(vecD dst, vecD src, memory mem) %{
6727   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
6728   match(Set dst (SubVB src (LoadVector mem)));
6729   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6730   ins_encode %{
6731     int vector_len = 0;
6732     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6733   %}
6734   ins_pipe( pipe_slow );
6735 %}
6736 
6737 instruct vsub8B_mem_evex(vecD dst, vecD src, memory mem) %{
6738   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
6739   match(Set dst (SubVB src (LoadVector mem)));
6740   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6741   ins_encode %{
6742     int vector_len = 0;
6743     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6744   %}
6745   ins_pipe( pipe_slow );
6746 %}
6747 
6748 instruct vsub8B_mem_evex_special(vecD dst, vecD src, memory mem) %{
6749   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
6750   match(Set dst (SubVB dst (LoadVector mem)));
6751   effect(TEMP src);
6752   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6753   ins_encode %{
6754     int vector_len = 0;
6755     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6756   %}
6757   ins_pipe( pipe_slow );
6758 %}
6759 
6760 instruct vsub16B(vecX dst, vecX src) %{
6761   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6762   match(Set dst (SubVB dst src));
6763   format %{ "psubb   $dst,$src\t! sub packed16B" %}
6764   ins_encode %{
6765     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6766   %}
6767   ins_pipe( pipe_slow );
6768 %}
6769 
6770 instruct vsub16B_reg_avx(vecX dst, vecX src1, vecX src2) %{
6771   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
6772   match(Set dst (SubVB src1 src2));
6773   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6774   ins_encode %{
6775     int vector_len = 0;
6776     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6777   %}
6778   ins_pipe( pipe_slow );
6779 %}
6780 
6781 instruct vsub16B_reg_evex(vecX dst, vecX src1, vecX src2) %{
6782   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6783   match(Set dst (SubVB src1 src2));
6784   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6785   ins_encode %{
6786     int vector_len = 0;
6787     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6788   %}
6789   ins_pipe( pipe_slow );
6790 %}
6791 
6792 instruct vsub16B_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
6793   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6794   match(Set dst (SubVB dst src2));
6795   effect(TEMP src1);
6796   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6797   ins_encode %{
6798     int vector_len = 0;
6799     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6800   %}
6801   ins_pipe( pipe_slow );
6802 %}
6803 
6804 instruct vsub16B_mem_avx(vecX dst, vecX src, memory mem) %{
6805   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
6806   match(Set dst (SubVB src (LoadVector mem)));
6807   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6808   ins_encode %{
6809     int vector_len = 0;
6810     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6811   %}
6812   ins_pipe( pipe_slow );
6813 %}
6814 
6815 instruct vsub16B_mem_evex(vecX dst, vecX src, memory mem) %{
6816   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
6817   match(Set dst (SubVB src (LoadVector mem)));
6818   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6819   ins_encode %{
6820     int vector_len = 0;
6821     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6822   %}
6823   ins_pipe( pipe_slow );
6824 %}
6825 
6826 instruct vsub16B_mem_evex_special(vecX dst, vecX src, memory mem) %{
6827   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
6828   match(Set dst (SubVB dst (LoadVector mem)));
6829   effect(TEMP src);
6830   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6831   ins_encode %{
6832     int vector_len = 0;
6833     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6834   %}
6835   ins_pipe( pipe_slow );
6836 %}
6837 
6838 instruct vsub32B_reg_avx(vecY dst, vecY src1, vecY src2) %{
6839   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
6840   match(Set dst (SubVB src1 src2));
6841   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6842   ins_encode %{
6843     int vector_len = 1;
6844     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6845   %}
6846   ins_pipe( pipe_slow );
6847 %}
6848 
6849 instruct vsub32B_reg_evex(vecY dst, vecY src1, vecY src2) %{
6850   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6851   match(Set dst (SubVB src1 src2));
6852   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6853   ins_encode %{
6854     int vector_len = 1;
6855     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6856   %}
6857   ins_pipe( pipe_slow );
6858 %}
6859 
6860 instruct vsub32B_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
6861   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
6862   match(Set dst (SubVB dst src2));
6863   effect(TEMP src1);
6864   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6865   ins_encode %{
6866     int vector_len = 1;
6867     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6868   %}
6869   ins_pipe( pipe_slow );
6870 %}
6871 
6872 instruct vsub32B_mem_avx(vecY dst, vecY src, memory mem) %{
6873   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 32);
6874   match(Set dst (SubVB src (LoadVector mem)));
6875   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6876   ins_encode %{
6877     int vector_len = 1;
6878     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6879   %}
6880   ins_pipe( pipe_slow );
6881 %}
6882 
6883 instruct vsub32B_mem_evex(vecY dst, vecY src, memory mem) %{
6884   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6885   match(Set dst (SubVB src (LoadVector mem)));
6886   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6887   ins_encode %{
6888     int vector_len = 1;
6889     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6890   %}
6891   ins_pipe( pipe_slow );
6892 %}
6893 
6894 instruct vsub32B_mem_evex_special(vecY dst, vecY src, memory mem) %{
6895   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 32);
6896   match(Set dst (SubVB dst (LoadVector mem)));
6897   effect(TEMP src);
6898   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6899   ins_encode %{
6900     int vector_len = 1;
6901     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6902   %}
6903   ins_pipe( pipe_slow );
6904 %}
6905 
6906 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6907   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6908   match(Set dst (SubVB src1 src2));
6909   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
6910   ins_encode %{
6911     int vector_len = 2;
6912     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6913   %}
6914   ins_pipe( pipe_slow );
6915 %}
6916 
6917 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
6918   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6919   match(Set dst (SubVB src (LoadVector mem)));
6920   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
6921   ins_encode %{
6922     int vector_len = 2;
6923     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6924   %}
6925   ins_pipe( pipe_slow );
6926 %}
6927 
6928 // Shorts/Chars vector sub
6929 instruct vsub2S(vecS dst, vecS src) %{
6930   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6931   match(Set dst (SubVS dst src));
6932   format %{ "psubw   $dst,$src\t! sub packed2S" %}
6933   ins_encode %{
6934     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6935   %}
6936   ins_pipe( pipe_slow );
6937 %}
6938 
6939 instruct vsub2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
6940   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
6941   match(Set dst (SubVS src1 src2));
6942   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6943   ins_encode %{
6944     int vector_len = 0;
6945     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6946   %}
6947   ins_pipe( pipe_slow );
6948 %}
6949 
6950 instruct vsub2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
6951   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
6952   match(Set dst (SubVS src1 src2));
6953   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6954   ins_encode %{
6955     int vector_len = 0;
6956     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6957   %}
6958   ins_pipe( pipe_slow );
6959 %}
6960 
6961 instruct vsub2S_reg_evex_special(vecS dst, vecS src1, vecS src2) %{
6962   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
6963   match(Set dst (SubVS dst src2));
6964   effect(TEMP src1);
6965   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6966   ins_encode %{
6967     int vector_len = 0;
6968     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6969   %}
6970   ins_pipe( pipe_slow );
6971 %}
6972 
6973 instruct vsub2S_mem_avx(vecS dst, vecS src, memory mem) %{
6974   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
6975   match(Set dst (SubVS src (LoadVector mem)));
6976   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6977   ins_encode %{
6978     int vector_len = 0;
6979     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6980   %}
6981   ins_pipe( pipe_slow );
6982 %}
6983 
6984 instruct vsub2S_mem_evex(vecS dst, vecS src, memory mem) %{
6985   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
6986   match(Set dst (SubVS src (LoadVector mem)));
6987   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6988   ins_encode %{
6989     int vector_len = 0;
6990     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6991   %}
6992   ins_pipe( pipe_slow );
6993 %}
6994 
6995 instruct vsub2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
6996   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
6997   match(Set dst (SubVS dst (LoadVector mem)));
6998   effect(TEMP src);
6999   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
7000   ins_encode %{
7001     int vector_len = 0;
7002     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7003   %}
7004   ins_pipe( pipe_slow );
7005 %}
7006 
7007 instruct vsub4S(vecD dst, vecD src) %{
7008   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7009   match(Set dst (SubVS dst src));
7010   format %{ "psubw   $dst,$src\t! sub packed4S" %}
7011   ins_encode %{
7012     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
7013   %}
7014   ins_pipe( pipe_slow );
7015 %}
7016 
7017 instruct vsub4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
7018   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
7019   match(Set dst (SubVS src1 src2));
7020   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
7021   ins_encode %{
7022     int vector_len = 0;
7023     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7024   %}
7025   ins_pipe( pipe_slow );
7026 %}
7027 
7028 instruct vsub4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
7029   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7030   match(Set dst (SubVS src1 src2));
7031   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
7032   ins_encode %{
7033     int vector_len = 0;
7034     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7035   %}
7036   ins_pipe( pipe_slow );
7037 %}
7038 
7039 instruct vsub4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
7040   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7041   match(Set dst (SubVS dst src2));
7042   effect(TEMP src1);
7043   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
7044   ins_encode %{
7045     int vector_len = 0;
7046     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7047   %}
7048   ins_pipe( pipe_slow );
7049 %}
7050 
7051 instruct vsub4S_mem_avx(vecD dst, vecD src, memory mem) %{
7052   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
7053   match(Set dst (SubVS src (LoadVector mem)));
7054   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
7055   ins_encode %{
7056     int vector_len = 0;
7057     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7058   %}
7059   ins_pipe( pipe_slow );
7060 %}
7061 
7062 instruct vsub4S_mem_evex(vecD dst, vecD src, memory mem) %{
7063   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7064   match(Set dst (SubVS src (LoadVector mem)));
7065   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
7066   ins_encode %{
7067     int vector_len = 0;
7068     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7069   %}
7070   ins_pipe( pipe_slow );
7071 %}
7072 
7073 instruct vsub4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
7074   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7075   match(Set dst (SubVS dst (LoadVector mem)));
7076   effect(TEMP src);
7077   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
7078   ins_encode %{
7079     int vector_len = 0;
7080     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7081   %}
7082   ins_pipe( pipe_slow );
7083 %}
7084 
7085 instruct vsub8S(vecX dst, vecX src) %{
7086   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
7087   match(Set dst (SubVS dst src));
7088   format %{ "psubw   $dst,$src\t! sub packed8S" %}
7089   ins_encode %{
7090     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
7091   %}
7092   ins_pipe( pipe_slow );
7093 %}
7094 
7095 instruct vsub8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
7096   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
7097   match(Set dst (SubVS src1 src2));
7098   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
7099   ins_encode %{
7100     int vector_len = 0;
7101     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7102   %}
7103   ins_pipe( pipe_slow );
7104 %}
7105 
7106 instruct vsub8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
7107   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7108   match(Set dst (SubVS src1 src2));
7109   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
7110   ins_encode %{
7111     int vector_len = 0;
7112     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7113   %}
7114   ins_pipe( pipe_slow );
7115 %}
7116 
7117 instruct vsub8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
7118   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7119   match(Set dst (SubVS dst src2));
7120   effect(TEMP src1);
7121   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
7122   ins_encode %{
7123     int vector_len = 0;
7124     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7125   %}
7126   ins_pipe( pipe_slow );
7127 %}
7128 
7129 instruct vsub8S_mem_avx(vecX dst, vecX src, memory mem) %{
7130   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
7131   match(Set dst (SubVS src (LoadVector mem)));
7132   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
7133   ins_encode %{
7134     int vector_len = 0;
7135     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7136   %}
7137   ins_pipe( pipe_slow );
7138 %}
7139 
7140 instruct vsub8S_mem_evex(vecX dst, vecX src, memory mem) %{
7141   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7142   match(Set dst (SubVS src (LoadVector mem)));
7143   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
7144   ins_encode %{
7145     int vector_len = 0;
7146     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7147   %}
7148   ins_pipe( pipe_slow );
7149 %}
7150 
7151 instruct vsub8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
7152   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7153   match(Set dst (SubVS dst (LoadVector mem)));
7154   effect(TEMP src);
7155   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
7156   ins_encode %{
7157     int vector_len = 0;
7158     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7159   %}
7160   ins_pipe( pipe_slow );
7161 %}
7162 
7163 instruct vsub16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
7164   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7165   match(Set dst (SubVS src1 src2));
7166   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7167   ins_encode %{
7168     int vector_len = 1;
7169     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7170   %}
7171   ins_pipe( pipe_slow );
7172 %}
7173 
7174 instruct vsub16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
7175   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7176   match(Set dst (SubVS src1 src2));
7177   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7178   ins_encode %{
7179     int vector_len = 1;
7180     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7181   %}
7182   ins_pipe( pipe_slow );
7183 %}
7184 
7185 instruct vsub16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
7186   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7187   match(Set dst (SubVS dst src2));
7188   effect(TEMP src1);
7189   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
7190   ins_encode %{
7191     int vector_len = 1;
7192     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7193   %}
7194   ins_pipe( pipe_slow );
7195 %}
7196 
7197 instruct vsub16S_mem_avx(vecY dst, vecY src, memory mem) %{
7198   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7199   match(Set dst (SubVS src (LoadVector mem)));
7200   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7201   ins_encode %{
7202     int vector_len = 1;
7203     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7204   %}
7205   ins_pipe( pipe_slow );
7206 %}
7207 
7208 instruct vsub16S_mem_evex(vecY dst, vecY src, memory mem) %{
7209   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7210   match(Set dst (SubVS src (LoadVector mem)));
7211   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7212   ins_encode %{
7213     int vector_len = 1;
7214     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7215   %}
7216   ins_pipe( pipe_slow );
7217 %}
7218 
7219 instruct vsub16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
7220   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7221   match(Set dst (SubVS dst (LoadVector mem)));
7222    effect(TEMP src);
7223   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
7224   ins_encode %{
7225     int vector_len = 1;
7226     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7227   %}
7228   ins_pipe( pipe_slow );
7229 %}
7230 
7231 instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7232   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7233   match(Set dst (SubVS src1 src2));
7234   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
7235   ins_encode %{
7236     int vector_len = 2;
7237     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7238   %}
7239   ins_pipe( pipe_slow );
7240 %}
7241 
7242 instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
7243   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7244   match(Set dst (SubVS src (LoadVector mem)));
7245   format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
7246   ins_encode %{
7247     int vector_len = 2;
7248     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7249   %}
7250   ins_pipe( pipe_slow );
7251 %}
7252 
7253 // Integers vector sub
7254 instruct vsub2I(vecD dst, vecD src) %{
7255   predicate(n->as_Vector()->length() == 2);
7256   match(Set dst (SubVI dst src));
7257   format %{ "psubd   $dst,$src\t! sub packed2I" %}
7258   ins_encode %{
7259     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
7260   %}
7261   ins_pipe( pipe_slow );
7262 %}
7263 
7264 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
7265   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7266   match(Set dst (SubVI src1 src2));
7267   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
7268   ins_encode %{
7269     int vector_len = 0;
7270     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7271   %}
7272   ins_pipe( pipe_slow );
7273 %}
7274 
7275 instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{
7276   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7277   match(Set dst (SubVI src (LoadVector mem)));
7278   format %{ "vpsubd  $dst,$src,$mem\t! sub packed2I" %}
7279   ins_encode %{
7280     int vector_len = 0;
7281     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7282   %}
7283   ins_pipe( pipe_slow );
7284 %}
7285 
7286 instruct vsub4I(vecX dst, vecX src) %{
7287   predicate(n->as_Vector()->length() == 4);
7288   match(Set dst (SubVI dst src));
7289   format %{ "psubd   $dst,$src\t! sub packed4I" %}
7290   ins_encode %{
7291     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
7292   %}
7293   ins_pipe( pipe_slow );
7294 %}
7295 
7296 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
7297   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7298   match(Set dst (SubVI src1 src2));
7299   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
7300   ins_encode %{
7301     int vector_len = 0;
7302     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7303   %}
7304   ins_pipe( pipe_slow );
7305 %}
7306 
7307 instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
7308   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7309   match(Set dst (SubVI src (LoadVector mem)));
7310   format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
7311   ins_encode %{
7312     int vector_len = 0;
7313     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7314   %}
7315   ins_pipe( pipe_slow );
7316 %}
7317 
7318 instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
7319   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7320   match(Set dst (SubVI src1 src2));
7321   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
7322   ins_encode %{
7323     int vector_len = 1;
7324     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7325   %}
7326   ins_pipe( pipe_slow );
7327 %}
7328 
7329 instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
7330   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7331   match(Set dst (SubVI src (LoadVector mem)));
7332   format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
7333   ins_encode %{
7334     int vector_len = 1;
7335     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7336   %}
7337   ins_pipe( pipe_slow );
7338 %}
7339 
7340 instruct vsub16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
7341   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7342   match(Set dst (SubVI src1 src2));
7343   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed16I" %}
7344   ins_encode %{
7345     int vector_len = 2;
7346     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7347   %}
7348   ins_pipe( pipe_slow );
7349 %}
7350 
7351 instruct vsub16I_mem(vecZ dst, vecZ src, memory mem) %{
7352   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7353   match(Set dst (SubVI src (LoadVector mem)));
7354   format %{ "vpsubd  $dst,$src,$mem\t! sub packed16I" %}
7355   ins_encode %{
7356     int vector_len = 2;
7357     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7358   %}
7359   ins_pipe( pipe_slow );
7360 %}
7361 
7362 // Longs vector sub
7363 instruct vsub2L(vecX dst, vecX src) %{
7364   predicate(n->as_Vector()->length() == 2);
7365   match(Set dst (SubVL dst src));
7366   format %{ "psubq   $dst,$src\t! sub packed2L" %}
7367   ins_encode %{
7368     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
7369   %}
7370   ins_pipe( pipe_slow );
7371 %}
7372 
7373 instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
7374   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7375   match(Set dst (SubVL src1 src2));
7376   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
7377   ins_encode %{
7378     int vector_len = 0;
7379     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7380   %}
7381   ins_pipe( pipe_slow );
7382 %}
7383 
7384 instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
7385   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7386   match(Set dst (SubVL src (LoadVector mem)));
7387   format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
7388   ins_encode %{
7389     int vector_len = 0;
7390     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7391   %}
7392   ins_pipe( pipe_slow );
7393 %}
7394 
7395 instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
7396   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7397   match(Set dst (SubVL src1 src2));
7398   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
7399   ins_encode %{
7400     int vector_len = 1;
7401     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7402   %}
7403   ins_pipe( pipe_slow );
7404 %}
7405 
7406 instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
7407   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7408   match(Set dst (SubVL src (LoadVector mem)));
7409   format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
7410   ins_encode %{
7411     int vector_len = 1;
7412     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7413   %}
7414   ins_pipe( pipe_slow );
7415 %}
7416 
7417 instruct vsub8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7418   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7419   match(Set dst (SubVL src1 src2));
7420   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed8L" %}
7421   ins_encode %{
7422     int vector_len = 2;
7423     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7424   %}
7425   ins_pipe( pipe_slow );
7426 %}
7427 
7428 instruct vsub8L_mem(vecZ dst, vecZ src, memory mem) %{
7429   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7430   match(Set dst (SubVL src (LoadVector mem)));
7431   format %{ "vpsubq  $dst,$src,$mem\t! sub packed8L" %}
7432   ins_encode %{
7433     int vector_len = 2;
7434     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7435   %}
7436   ins_pipe( pipe_slow );
7437 %}
7438 
7439 // Floats vector sub
7440 instruct vsub2F(vecD dst, vecD src) %{
7441   predicate(n->as_Vector()->length() == 2);
7442   match(Set dst (SubVF dst src));
7443   format %{ "subps   $dst,$src\t! sub packed2F" %}
7444   ins_encode %{
7445     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7446   %}
7447   ins_pipe( pipe_slow );
7448 %}
7449 
7450 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
7451   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7452   match(Set dst (SubVF src1 src2));
7453   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
7454   ins_encode %{
7455     int vector_len = 0;
7456     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7457   %}
7458   ins_pipe( pipe_slow );
7459 %}
7460 
7461 instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{
7462   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7463   match(Set dst (SubVF src (LoadVector mem)));
7464   format %{ "vsubps  $dst,$src,$mem\t! sub packed2F" %}
7465   ins_encode %{
7466     int vector_len = 0;
7467     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7468   %}
7469   ins_pipe( pipe_slow );
7470 %}
7471 
7472 instruct vsub4F(vecX dst, vecX src) %{
7473   predicate(n->as_Vector()->length() == 4);
7474   match(Set dst (SubVF dst src));
7475   format %{ "subps   $dst,$src\t! sub packed4F" %}
7476   ins_encode %{
7477     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7478   %}
7479   ins_pipe( pipe_slow );
7480 %}
7481 
7482 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
7483   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7484   match(Set dst (SubVF src1 src2));
7485   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
7486   ins_encode %{
7487     int vector_len = 0;
7488     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7489   %}
7490   ins_pipe( pipe_slow );
7491 %}
7492 
7493 instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
7494   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7495   match(Set dst (SubVF src (LoadVector mem)));
7496   format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
7497   ins_encode %{
7498     int vector_len = 0;
7499     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7500   %}
7501   ins_pipe( pipe_slow );
7502 %}
7503 
7504 instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
7505   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7506   match(Set dst (SubVF src1 src2));
7507   format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
7508   ins_encode %{
7509     int vector_len = 1;
7510     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7511   %}
7512   ins_pipe( pipe_slow );
7513 %}
7514 
7515 instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
7516   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7517   match(Set dst (SubVF src (LoadVector mem)));
7518   format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
7519   ins_encode %{
7520     int vector_len = 1;
7521     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7522   %}
7523   ins_pipe( pipe_slow );
7524 %}
7525 
7526 instruct vsub16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
7527   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7528   match(Set dst (SubVF src1 src2));
7529   format %{ "vsubps  $dst,$src1,$src2\t! sub packed16F" %}
7530   ins_encode %{
7531     int vector_len = 2;
7532     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7533   %}
7534   ins_pipe( pipe_slow );
7535 %}
7536 
7537 instruct vsub16F_mem(vecZ dst, vecZ src, memory mem) %{
7538   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7539   match(Set dst (SubVF src (LoadVector mem)));
7540   format %{ "vsubps  $dst,$src,$mem\t! sub packed16F" %}
7541   ins_encode %{
7542     int vector_len = 2;
7543     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7544   %}
7545   ins_pipe( pipe_slow );
7546 %}
7547 
7548 // Doubles vector sub
7549 instruct vsub2D(vecX dst, vecX src) %{
7550   predicate(n->as_Vector()->length() == 2);
7551   match(Set dst (SubVD dst src));
7552   format %{ "subpd   $dst,$src\t! sub packed2D" %}
7553   ins_encode %{
7554     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
7555   %}
7556   ins_pipe( pipe_slow );
7557 %}
7558 
7559 instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
7560   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7561   match(Set dst (SubVD src1 src2));
7562   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
7563   ins_encode %{
7564     int vector_len = 0;
7565     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7566   %}
7567   ins_pipe( pipe_slow );
7568 %}
7569 
7570 instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
7571   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7572   match(Set dst (SubVD src (LoadVector mem)));
7573   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
7574   ins_encode %{
7575     int vector_len = 0;
7576     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7577   %}
7578   ins_pipe( pipe_slow );
7579 %}
7580 
7581 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
7582   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7583   match(Set dst (SubVD src1 src2));
7584   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
7585   ins_encode %{
7586     int vector_len = 1;
7587     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7588   %}
7589   ins_pipe( pipe_slow );
7590 %}
7591 
7592 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
7593   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7594   match(Set dst (SubVD src (LoadVector mem)));
7595   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
7596   ins_encode %{
7597     int vector_len = 1;
7598     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7599   %}
7600   ins_pipe( pipe_slow );
7601 %}
7602 
7603 instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7604   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7605   match(Set dst (SubVD src1 src2));
7606   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed8D" %}
7607   ins_encode %{
7608     int vector_len = 2;
7609     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7610   %}
7611   ins_pipe( pipe_slow );
7612 %}
7613 
7614 instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{
7615   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7616   match(Set dst (SubVD src (LoadVector mem)));
7617   format %{ "vsubpd  $dst,$src,$mem\t! sub packed8D" %}
7618   ins_encode %{
7619     int vector_len = 2;
7620     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7621   %}
7622   ins_pipe( pipe_slow );
7623 %}
7624 
7625 // --------------------------------- MUL --------------------------------------
7626 
7627 // Shorts/Chars vector mul
7628 instruct vmul2S(vecS dst, vecS src) %{
7629   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7630   match(Set dst (MulVS dst src));
7631   format %{ "pmullw $dst,$src\t! mul packed2S" %}
7632   ins_encode %{
7633     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7634   %}
7635   ins_pipe( pipe_slow );
7636 %}
7637 
7638 instruct vmul2S_reg_avx(vecS dst, vecS src1, vecS src2) %{
7639   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
7640   match(Set dst (MulVS src1 src2));
7641   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7642   ins_encode %{
7643     int vector_len = 0;
7644     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7645   %}
7646   ins_pipe( pipe_slow );
7647 %}
7648 
7649 instruct vmul2S_reg_evex(vecS dst, vecS src1, vecS src2) %{
7650   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
7651   match(Set dst (MulVS src1 src2));
7652   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7653   ins_encode %{
7654     int vector_len = 0;
7655     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7656   %}
7657   ins_pipe( pipe_slow );
7658 %}
7659 
7660 instruct vmul2S_evex_special(vecS dst, vecS src1, vecS src2) %{
7661   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
7662   match(Set dst (MulVS dst src2));
7663   effect(TEMP src1);
7664   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7665   ins_encode %{
7666     int vector_len = 0;
7667     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7668   %}
7669   ins_pipe( pipe_slow );
7670 %}
7671 
7672 instruct vmul2S_mem_avx(vecS dst, vecS src, memory mem) %{
7673   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
7674   match(Set dst (MulVS src (LoadVector mem)));
7675   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7676   ins_encode %{
7677     int vector_len = 0;
7678     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7679   %}
7680   ins_pipe( pipe_slow );
7681 %}
7682 
7683 instruct vmul2S_mem_evex(vecS dst, vecS src, memory mem) %{
7684   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
7685   match(Set dst (MulVS src (LoadVector mem)));
7686   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7687   ins_encode %{
7688     int vector_len = 0;
7689     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7690   %}
7691   ins_pipe( pipe_slow );
7692 %}
7693 
7694 instruct vmul2S_mem_evex_special(vecS dst, vecS src, memory mem) %{
7695   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
7696   match(Set dst (MulVS dst (LoadVector mem)));
7697   effect(TEMP src);
7698   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7699   ins_encode %{
7700     int vector_len = 0;
7701     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7702   %}
7703   ins_pipe( pipe_slow );
7704 %}
7705 
7706 instruct vmul4S(vecD dst, vecD src) %{
7707   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7708   match(Set dst (MulVS dst src));
7709   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
7710   ins_encode %{
7711     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7712   %}
7713   ins_pipe( pipe_slow );
7714 %}
7715 
7716 instruct vmul4S_reg_avx(vecD dst, vecD src1, vecD src2) %{
7717   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
7718   match(Set dst (MulVS src1 src2));
7719   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7720   ins_encode %{
7721     int vector_len = 0;
7722     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7723   %}
7724   ins_pipe( pipe_slow );
7725 %}
7726 
7727 instruct vmul4S_reg_evex(vecD dst, vecD src1, vecD src2) %{
7728   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7729   match(Set dst (MulVS src1 src2));
7730   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7731   ins_encode %{
7732     int vector_len = 0;
7733     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7734   %}
7735   ins_pipe( pipe_slow );
7736 %}
7737 
7738 instruct vmul4S_reg_evex_special(vecD dst, vecD src1, vecD src2) %{
7739   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7740   match(Set dst (MulVS dst src2));
7741   effect(TEMP src1);
7742   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7743   ins_encode %{
7744     int vector_len = 0;
7745     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7746   %}
7747   ins_pipe( pipe_slow );
7748 %}
7749 
7750 instruct vmul4S_mem_avx(vecD dst, vecD src, memory mem) %{
7751   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
7752   match(Set dst (MulVS src (LoadVector mem)));
7753   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7754   ins_encode %{
7755     int vector_len = 0;
7756     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7757   %}
7758   ins_pipe( pipe_slow );
7759 %}
7760 
7761 instruct vmul4S_mem_evex(vecD dst, vecD src, memory mem) %{
7762   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
7763   match(Set dst (MulVS src (LoadVector mem)));
7764   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7765   ins_encode %{
7766     int vector_len = 0;
7767     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7768   %}
7769   ins_pipe( pipe_slow );
7770 %}
7771 
7772 instruct vmul4S_mem_evex_special(vecD dst, vecD src, memory mem) %{
7773   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
7774   match(Set dst (MulVS dst (LoadVector mem)));
7775   effect(TEMP src);
7776   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7777   ins_encode %{
7778     int vector_len = 0;
7779     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7780   %}
7781   ins_pipe( pipe_slow );
7782 %}
7783 
7784 instruct vmul8S(vecX dst, vecX src) %{
7785   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
7786   match(Set dst (MulVS dst src));
7787   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
7788   ins_encode %{
7789     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7790   %}
7791   ins_pipe( pipe_slow );
7792 %}
7793 
7794 instruct vmul8S_reg_avx(vecX dst, vecX src1, vecX src2) %{
7795   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
7796   match(Set dst (MulVS src1 src2));
7797   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7798   ins_encode %{
7799     int vector_len = 0;
7800     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7801   %}
7802   ins_pipe( pipe_slow );
7803 %}
7804 
7805 instruct vmul8S_reg_evex(vecX dst, vecX src1, vecX src2) %{
7806   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7807   match(Set dst (MulVS src1 src2));
7808   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7809   ins_encode %{
7810     int vector_len = 0;
7811     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7812   %}
7813   ins_pipe( pipe_slow );
7814 %}
7815 
7816 instruct vmul8S_reg_evex_special(vecX dst, vecX src1, vecX src2) %{
7817   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7818   match(Set dst (MulVS dst src2));
7819   effect(TEMP src1);
7820   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7821   ins_encode %{
7822     int vector_len = 0;
7823     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7824   %}
7825   ins_pipe( pipe_slow );
7826 %}
7827 
7828 instruct vmul8S_mem_avx(vecX dst, vecX src, memory mem) %{
7829   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
7830   match(Set dst (MulVS src (LoadVector mem)));
7831   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7832   ins_encode %{
7833     int vector_len = 0;
7834     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7835   %}
7836   ins_pipe( pipe_slow );
7837 %}
7838 
7839 instruct vmul8S_mem_evex(vecX dst, vecX src, memory mem) %{
7840   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
7841   match(Set dst (MulVS src (LoadVector mem)));
7842   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7843   ins_encode %{
7844     int vector_len = 0;
7845     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7846   %}
7847   ins_pipe( pipe_slow );
7848 %}
7849 
7850 instruct vmul8S_mem_evex_special(vecX dst, vecX src, memory mem) %{
7851   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
7852   match(Set dst (MulVS dst (LoadVector mem)));
7853   effect(TEMP src);
7854   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7855   ins_encode %{
7856     int vector_len = 0;
7857     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7858   %}
7859   ins_pipe( pipe_slow );
7860 %}
7861 
7862 instruct vmul16S_reg_avx(vecY dst, vecY src1, vecY src2) %{
7863   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7864   match(Set dst (MulVS src1 src2));
7865   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7866   ins_encode %{
7867     int vector_len = 1;
7868     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7869   %}
7870   ins_pipe( pipe_slow );
7871 %}
7872 
7873 instruct vmul16S_reg_evex(vecY dst, vecY src1, vecY src2) %{
7874   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7875   match(Set dst (MulVS src1 src2));
7876   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7877   ins_encode %{
7878     int vector_len = 1;
7879     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7880   %}
7881   ins_pipe( pipe_slow );
7882 %}
7883 
7884 instruct vmul16S_reg_evex_special(vecY dst, vecY src1, vecY src2) %{
7885   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7886   match(Set dst (MulVS dst src2));
7887   effect(TEMP src1);
7888   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7889   ins_encode %{
7890     int vector_len = 1;
7891     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7892   %}
7893   ins_pipe( pipe_slow );
7894 %}
7895 
7896 instruct vmul16S_mem_avx(vecY dst, vecY src, memory mem) %{
7897   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
7898   match(Set dst (MulVS src (LoadVector mem)));
7899   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7900   ins_encode %{
7901     int vector_len = 1;
7902     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7903   %}
7904   ins_pipe( pipe_slow );
7905 %}
7906 
7907 instruct vmul16S_mem_evex(vecY dst, vecY src, memory mem) %{
7908   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
7909   match(Set dst (MulVS src (LoadVector mem)));
7910   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7911   ins_encode %{
7912     int vector_len = 1;
7913     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7914   %}
7915   ins_pipe( pipe_slow );
7916 %}
7917 
7918 instruct vmul16S_mem_evex_special(vecY dst, vecY src, memory mem) %{
7919   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
7920   match(Set dst (MulVS dst (LoadVector mem)));
7921   effect(TEMP src);
7922   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7923   ins_encode %{
7924     int vector_len = 1;
7925     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7926   %}
7927   ins_pipe( pipe_slow );
7928 %}
7929 
7930 instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7931   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7932   match(Set dst (MulVS src1 src2));
7933   format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
7934   ins_encode %{
7935     int vector_len = 2;
7936     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7937   %}
7938   ins_pipe( pipe_slow );
7939 %}
7940 
7941 instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
7942   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7943   match(Set dst (MulVS src (LoadVector mem)));
7944   format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
7945   ins_encode %{
7946     int vector_len = 2;
7947     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7948   %}
7949   ins_pipe( pipe_slow );
7950 %}
7951 
7952 // Integers vector mul (sse4_1)
7953 instruct vmul2I(vecD dst, vecD src) %{
7954   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
7955   match(Set dst (MulVI dst src));
7956   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
7957   ins_encode %{
7958     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7959   %}
7960   ins_pipe( pipe_slow );
7961 %}
7962 
7963 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
7964   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7965   match(Set dst (MulVI src1 src2));
7966   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
7967   ins_encode %{
7968     int vector_len = 0;
7969     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7970   %}
7971   ins_pipe( pipe_slow );
7972 %}
7973 
7974 instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{
7975   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7976   match(Set dst (MulVI src (LoadVector mem)));
7977   format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %}
7978   ins_encode %{
7979     int vector_len = 0;
7980     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7981   %}
7982   ins_pipe( pipe_slow );
7983 %}
7984 
7985 instruct vmul4I(vecX dst, vecX src) %{
7986   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
7987   match(Set dst (MulVI dst src));
7988   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
7989   ins_encode %{
7990     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7991   %}
7992   ins_pipe( pipe_slow );
7993 %}
7994 
7995 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
7996   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7997   match(Set dst (MulVI src1 src2));
7998   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
7999   ins_encode %{
8000     int vector_len = 0;
8001     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8002   %}
8003   ins_pipe( pipe_slow );
8004 %}
8005 
8006 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
8007   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8008   match(Set dst (MulVI src (LoadVector mem)));
8009   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
8010   ins_encode %{
8011     int vector_len = 0;
8012     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8013   %}
8014   ins_pipe( pipe_slow );
8015 %}
8016 
8017 instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
8018   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
8019   match(Set dst (MulVL src1 src2));
8020   format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
8021   ins_encode %{
8022     int vector_len = 0;
8023     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8024   %}
8025   ins_pipe( pipe_slow );
8026 %}
8027 
8028 instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{
8029   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
8030   match(Set dst (MulVL src (LoadVector mem)));
8031   format %{ "vpmullq $dst,$src,$mem\t! mul packed2L" %}
8032   ins_encode %{
8033     int vector_len = 0;
8034     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8035   %}
8036   ins_pipe( pipe_slow );
8037 %}
8038 
8039 instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{
8040   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
8041   match(Set dst (MulVL src1 src2));
8042   format %{ "vpmullq $dst,$src1,$src2\t! mul packed4L" %}
8043   ins_encode %{
8044     int vector_len = 1;
8045     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8046   %}
8047   ins_pipe( pipe_slow );
8048 %}
8049 
8050 instruct vmul4L_mem(vecY dst, vecY src, memory mem) %{
8051   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
8052   match(Set dst (MulVL src (LoadVector mem)));
8053   format %{ "vpmullq $dst,$src,$mem\t! mul packed4L" %}
8054   ins_encode %{
8055     int vector_len = 1;
8056     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8057   %}
8058   ins_pipe( pipe_slow );
8059 %}
8060 
8061 instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
8062   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
8063   match(Set dst (MulVL src1 src2));
8064   format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
8065   ins_encode %{
8066     int vector_len = 2;
8067     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8068   %}
8069   ins_pipe( pipe_slow );
8070 %}
8071 
8072 instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
8073   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
8074   match(Set dst (MulVL src (LoadVector mem)));
8075   format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
8076   ins_encode %{
8077     int vector_len = 2;
8078     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8079   %}
8080   ins_pipe( pipe_slow );
8081 %}
8082 
8083 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
8084   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8085   match(Set dst (MulVI src1 src2));
8086   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
8087   ins_encode %{
8088     int vector_len = 1;
8089     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8090   %}
8091   ins_pipe( pipe_slow );
8092 %}
8093 
8094 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
8095   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8096   match(Set dst (MulVI src (LoadVector mem)));
8097   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
8098   ins_encode %{
8099     int vector_len = 1;
8100     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8101   %}
8102   ins_pipe( pipe_slow );
8103 %}
8104 
8105 instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
8106   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8107   match(Set dst (MulVI src1 src2));
8108   format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
8109   ins_encode %{
8110     int vector_len = 2;
8111     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8112   %}
8113   ins_pipe( pipe_slow );
8114 %}
8115 
8116 instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{
8117   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8118   match(Set dst (MulVI src (LoadVector mem)));
8119   format %{ "vpmulld $dst,$src,$mem\t! mul packed16I" %}
8120   ins_encode %{
8121     int vector_len = 2;
8122     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8123   %}
8124   ins_pipe( pipe_slow );
8125 %}
8126 
8127 // Floats vector mul
8128 instruct vmul2F(vecD dst, vecD src) %{
8129   predicate(n->as_Vector()->length() == 2);
8130   match(Set dst (MulVF dst src));
8131   format %{ "mulps   $dst,$src\t! mul packed2F" %}
8132   ins_encode %{
8133     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
8134   %}
8135   ins_pipe( pipe_slow );
8136 %}
8137 
8138 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
8139   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8140   match(Set dst (MulVF src1 src2));
8141   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
8142   ins_encode %{
8143     int vector_len = 0;
8144     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8145   %}
8146   ins_pipe( pipe_slow );
8147 %}
8148 
8149 instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{
8150   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8151   match(Set dst (MulVF src (LoadVector mem)));
8152   format %{ "vmulps  $dst,$src,$mem\t! mul packed2F" %}
8153   ins_encode %{
8154     int vector_len = 0;
8155     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8156   %}
8157   ins_pipe( pipe_slow );
8158 %}
8159 
8160 instruct vmul4F(vecX dst, vecX src) %{
8161   predicate(n->as_Vector()->length() == 4);
8162   match(Set dst (MulVF dst src));
8163   format %{ "mulps   $dst,$src\t! mul packed4F" %}
8164   ins_encode %{
8165     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
8166   %}
8167   ins_pipe( pipe_slow );
8168 %}
8169 
8170 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
8171   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8172   match(Set dst (MulVF src1 src2));
8173   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
8174   ins_encode %{
8175     int vector_len = 0;
8176     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8177   %}
8178   ins_pipe( pipe_slow );
8179 %}
8180 
8181 instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
8182   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8183   match(Set dst (MulVF src (LoadVector mem)));
8184   format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
8185   ins_encode %{
8186     int vector_len = 0;
8187     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8188   %}
8189   ins_pipe( pipe_slow );
8190 %}
8191 
8192 instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
8193   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8194   match(Set dst (MulVF src1 src2));
8195   format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
8196   ins_encode %{
8197     int vector_len = 1;
8198     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8199   %}
8200   ins_pipe( pipe_slow );
8201 %}
8202 
8203 instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
8204   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8205   match(Set dst (MulVF src (LoadVector mem)));
8206   format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
8207   ins_encode %{
8208     int vector_len = 1;
8209     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8210   %}
8211   ins_pipe( pipe_slow );
8212 %}
8213 
8214 instruct vmul16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
8215   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8216   match(Set dst (MulVF src1 src2));
8217   format %{ "vmulps  $dst,$src1,$src2\t! mul packed16F" %}
8218   ins_encode %{
8219     int vector_len = 2;
8220     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8221   %}
8222   ins_pipe( pipe_slow );
8223 %}
8224 
8225 instruct vmul16F_mem(vecZ dst, vecZ src, memory mem) %{
8226   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8227   match(Set dst (MulVF src (LoadVector mem)));
8228   format %{ "vmulps  $dst,$src,$mem\t! mul packed16F" %}
8229   ins_encode %{
8230     int vector_len = 2;
8231     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8232   %}
8233   ins_pipe( pipe_slow );
8234 %}
8235 
8236 // Doubles vector mul
8237 instruct vmul2D(vecX dst, vecX src) %{
8238   predicate(n->as_Vector()->length() == 2);
8239   match(Set dst (MulVD dst src));
8240   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
8241   ins_encode %{
8242     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
8243   %}
8244   ins_pipe( pipe_slow );
8245 %}
8246 
8247 instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
8248   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8249   match(Set dst (MulVD src1 src2));
8250   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
8251   ins_encode %{
8252     int vector_len = 0;
8253     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8254   %}
8255   ins_pipe( pipe_slow );
8256 %}
8257 
8258 instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
8259   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8260   match(Set dst (MulVD src (LoadVector mem)));
8261   format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
8262   ins_encode %{
8263     int vector_len = 0;
8264     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8265   %}
8266   ins_pipe( pipe_slow );
8267 %}
8268 
8269 instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
8270   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8271   match(Set dst (MulVD src1 src2));
8272   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
8273   ins_encode %{
8274     int vector_len = 1;
8275     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8276   %}
8277   ins_pipe( pipe_slow );
8278 %}
8279 
8280 instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
8281   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8282   match(Set dst (MulVD src (LoadVector mem)));
8283   format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
8284   ins_encode %{
8285     int vector_len = 1;
8286     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8287   %}
8288   ins_pipe( pipe_slow );
8289 %}
8290 
8291 instruct vmul8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
8292   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8293   match(Set dst (MulVD src1 src2));
8294   format %{ "vmulpd  $dst k0,$src1,$src2\t! mul packed8D" %}
8295   ins_encode %{
8296     int vector_len = 2;
8297     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8298   %}
8299   ins_pipe( pipe_slow );
8300 %}
8301 
8302 instruct vmul8D_mem(vecZ dst, vecZ src, memory mem) %{
8303   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8304   match(Set dst (MulVD src (LoadVector mem)));
8305   format %{ "vmulpd  $dst k0,$src,$mem\t! mul packed8D" %}
8306   ins_encode %{
8307     int vector_len = 2;
8308     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8309   %}
8310   ins_pipe( pipe_slow );
8311 %}
8312 
8313 instruct vcmov8F_reg(vecY dst, vecY src1, vecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
8314   predicate(UseAVX > 0 && UseAVX < 3 && n->as_Vector()->length() == 8);
8315   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
8316   effect(TEMP dst, USE src1, USE src2);
8317   format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
8318             "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
8319          %}
8320   ins_encode %{
8321     int vector_len = 1;
8322     int cond = (Assembler::Condition)($copnd$$cmpcode);
8323     __ cmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
8324     __ blendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
8325   %}
8326   ins_pipe( pipe_slow );
8327 %}
8328 
8329 instruct vcmov4D_reg(vecY dst, vecY src1, vecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
8330   predicate(UseAVX > 0 && UseAVX < 3 && n->as_Vector()->length() == 4);
8331   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
8332   effect(TEMP dst, USE src1, USE src2);
8333   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
8334             "blendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
8335          %}
8336   ins_encode %{
8337     int vector_len = 1;
8338     int cond = (Assembler::Condition)($copnd$$cmpcode);
8339     __ cmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
8340     __ blendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
8341   %}
8342   ins_pipe( pipe_slow );
8343 %}
8344 
8345 // --------------------------------- DIV --------------------------------------
8346 
8347 // Floats vector div
8348 instruct vdiv2F(vecD dst, vecD src) %{
8349   predicate(n->as_Vector()->length() == 2);
8350   match(Set dst (DivVF dst src));
8351   format %{ "divps   $dst,$src\t! div packed2F" %}
8352   ins_encode %{
8353     __ divps($dst$$XMMRegister, $src$$XMMRegister);
8354   %}
8355   ins_pipe( pipe_slow );
8356 %}
8357 
8358 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
8359   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8360   match(Set dst (DivVF src1 src2));
8361   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
8362   ins_encode %{
8363     int vector_len = 0;
8364     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8365   %}
8366   ins_pipe( pipe_slow );
8367 %}
8368 
8369 instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{
8370   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8371   match(Set dst (DivVF src (LoadVector mem)));
8372   format %{ "vdivps  $dst,$src,$mem\t! div packed2F" %}
8373   ins_encode %{
8374     int vector_len = 0;
8375     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8376   %}
8377   ins_pipe( pipe_slow );
8378 %}
8379 
8380 instruct vdiv4F(vecX dst, vecX src) %{
8381   predicate(n->as_Vector()->length() == 4);
8382   match(Set dst (DivVF dst src));
8383   format %{ "divps   $dst,$src\t! div packed4F" %}
8384   ins_encode %{
8385     __ divps($dst$$XMMRegister, $src$$XMMRegister);
8386   %}
8387   ins_pipe( pipe_slow );
8388 %}
8389 
8390 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
8391   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8392   match(Set dst (DivVF src1 src2));
8393   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
8394   ins_encode %{
8395     int vector_len = 0;
8396     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8397   %}
8398   ins_pipe( pipe_slow );
8399 %}
8400 
8401 instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
8402   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8403   match(Set dst (DivVF src (LoadVector mem)));
8404   format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
8405   ins_encode %{
8406     int vector_len = 0;
8407     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8408   %}
8409   ins_pipe( pipe_slow );
8410 %}
8411 
8412 instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
8413   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8414   match(Set dst (DivVF src1 src2));
8415   format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
8416   ins_encode %{
8417     int vector_len = 1;
8418     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8419   %}
8420   ins_pipe( pipe_slow );
8421 %}
8422 
8423 instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
8424   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8425   match(Set dst (DivVF src (LoadVector mem)));
8426   format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
8427   ins_encode %{
8428     int vector_len = 1;
8429     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8430   %}
8431   ins_pipe( pipe_slow );
8432 %}
8433 
8434 instruct vdiv16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
8435   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
8436   match(Set dst (DivVF src1 src2));
8437   format %{ "vdivps  $dst,$src1,$src2\t! div packed16F" %}
8438   ins_encode %{
8439     int vector_len = 2;
8440     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8441   %}
8442   ins_pipe( pipe_slow );
8443 %}
8444 
8445 instruct vdiv16F_mem(vecZ dst, vecZ src, memory mem) %{
8446   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
8447   match(Set dst (DivVF src (LoadVector mem)));
8448   format %{ "vdivps  $dst,$src,$mem\t! div packed16F" %}
8449   ins_encode %{
8450     int vector_len = 2;
8451     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8452   %}
8453   ins_pipe( pipe_slow );
8454 %}
8455 
8456 // Doubles vector div
8457 instruct vdiv2D(vecX dst, vecX src) %{
8458   predicate(n->as_Vector()->length() == 2);
8459   match(Set dst (DivVD dst src));
8460   format %{ "divpd   $dst,$src\t! div packed2D" %}
8461   ins_encode %{
8462     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
8463   %}
8464   ins_pipe( pipe_slow );
8465 %}
8466 
8467 instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
8468   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8469   match(Set dst (DivVD src1 src2));
8470   format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
8471   ins_encode %{
8472     int vector_len = 0;
8473     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8474   %}
8475   ins_pipe( pipe_slow );
8476 %}
8477 
8478 instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
8479   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8480   match(Set dst (DivVD src (LoadVector mem)));
8481   format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
8482   ins_encode %{
8483     int vector_len = 0;
8484     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8485   %}
8486   ins_pipe( pipe_slow );
8487 %}
8488 
8489 instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
8490   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8491   match(Set dst (DivVD src1 src2));
8492   format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
8493   ins_encode %{
8494     int vector_len = 1;
8495     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8496   %}
8497   ins_pipe( pipe_slow );
8498 %}
8499 
8500 instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
8501   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8502   match(Set dst (DivVD src (LoadVector mem)));
8503   format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
8504   ins_encode %{
8505     int vector_len = 1;
8506     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8507   %}
8508   ins_pipe( pipe_slow );
8509 %}
8510 
8511 instruct vdiv8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
8512   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8513   match(Set dst (DivVD src1 src2));
8514   format %{ "vdivpd  $dst,$src1,$src2\t! div packed8D" %}
8515   ins_encode %{
8516     int vector_len = 2;
8517     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8518   %}
8519   ins_pipe( pipe_slow );
8520 %}
8521 
8522 instruct vdiv8D_mem(vecZ dst, vecZ src, memory mem) %{
8523   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8524   match(Set dst (DivVD src (LoadVector mem)));
8525   format %{ "vdivpd  $dst,$src,$mem\t! div packed8D" %}
8526   ins_encode %{
8527     int vector_len = 2;
8528     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8529   %}
8530   ins_pipe( pipe_slow );
8531 %}
8532 
8533 // ------------------------------ Shift ---------------------------------------
8534 
8535 // Left and right shift count vectors are the same on x86
8536 // (only lowest bits of xmm reg are used for count).
8537 instruct vshiftcnt(vecS dst, rRegI cnt) %{
8538   match(Set dst (LShiftCntV cnt));
8539   match(Set dst (RShiftCntV cnt));
8540   format %{ "movd    $dst,$cnt\t! load shift count" %}
8541   ins_encode %{
8542     __ movdl($dst$$XMMRegister, $cnt$$Register);
8543   %}
8544   ins_pipe( pipe_slow );
8545 %}
8546 
8547 // --------------------------------- Sqrt --------------------------------------
8548 
8549 // Floating point vector sqrt
8550 instruct vsqrt2D_reg(vecX dst, vecX src) %{
8551   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8552   match(Set dst (SqrtVD src));
8553   format %{ "vsqrtpd  $dst,$src\t! sqrt packed2D" %}
8554   ins_encode %{
8555     int vector_len = 0;
8556     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8557   %}
8558   ins_pipe( pipe_slow );
8559 %}
8560 
8561 instruct vsqrt2D_mem(vecX dst, memory mem) %{
8562   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8563   match(Set dst (SqrtVD (LoadVector mem)));
8564   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed2D" %}
8565   ins_encode %{
8566     int vector_len = 0;
8567     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8568   %}
8569   ins_pipe( pipe_slow );
8570 %}
8571 
8572 instruct vsqrt4D_reg(vecY dst, vecY src) %{
8573   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8574   match(Set dst (SqrtVD src));
8575   format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
8576   ins_encode %{
8577     int vector_len = 1;
8578     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8579   %}
8580   ins_pipe( pipe_slow );
8581 %}
8582 
8583 instruct vsqrt4D_mem(vecY dst, memory mem) %{
8584   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8585   match(Set dst (SqrtVD (LoadVector mem)));
8586   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
8587   ins_encode %{
8588     int vector_len = 1;
8589     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8590   %}
8591   ins_pipe( pipe_slow );
8592 %}
8593 
8594 instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
8595   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8596   match(Set dst (SqrtVD src));
8597   format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
8598   ins_encode %{
8599     int vector_len = 2;
8600     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8601   %}
8602   ins_pipe( pipe_slow );
8603 %}
8604 
8605 instruct vsqrt8D_mem(vecZ dst, memory mem) %{
8606   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8607   match(Set dst (SqrtVD (LoadVector mem)));
8608   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
8609   ins_encode %{
8610     int vector_len = 2;
8611     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8612   %}
8613   ins_pipe( pipe_slow );
8614 %}
8615 
8616 instruct vsqrt2F_reg(vecD dst, vecD src) %{
8617   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8618   match(Set dst (SqrtVF src));
8619   format %{ "vsqrtps  $dst,$src\t! sqrt packed2F" %}
8620   ins_encode %{
8621     int vector_len = 0;
8622     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8623   %}
8624   ins_pipe( pipe_slow );
8625 %}
8626 
8627 instruct vsqrt2F_mem(vecD dst, memory mem) %{
8628   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8629   match(Set dst (SqrtVF (LoadVector mem)));
8630   format %{ "vsqrtps  $dst,$mem\t! sqrt packed2F" %}
8631   ins_encode %{
8632     int vector_len = 0;
8633     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8634   %}
8635   ins_pipe( pipe_slow );
8636 %}
8637 
8638 instruct vsqrt4F_reg(vecX dst, vecX src) %{
8639   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8640   match(Set dst (SqrtVF src));
8641   format %{ "vsqrtps  $dst,$src\t! sqrt packed4F" %}
8642   ins_encode %{
8643     int vector_len = 0;
8644     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8645   %}
8646   ins_pipe( pipe_slow );
8647 %}
8648 
8649 instruct vsqrt4F_mem(vecX dst, memory mem) %{
8650   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8651   match(Set dst (SqrtVF (LoadVector mem)));
8652   format %{ "vsqrtps  $dst,$mem\t! sqrt packed4F" %}
8653   ins_encode %{
8654     int vector_len = 0;
8655     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8656   %}
8657   ins_pipe( pipe_slow );
8658 %}
8659 
8660 instruct vsqrt8F_reg(vecY dst, vecY src) %{
8661   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8662   match(Set dst (SqrtVF src));
8663   format %{ "vsqrtps  $dst,$src\t! sqrt packed8F" %}
8664   ins_encode %{
8665     int vector_len = 1;
8666     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8667   %}
8668   ins_pipe( pipe_slow );
8669 %}
8670 
8671 instruct vsqrt8F_mem(vecY dst, memory mem) %{
8672   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8673   match(Set dst (SqrtVF (LoadVector mem)));
8674   format %{ "vsqrtps  $dst,$mem\t! sqrt packed8F" %}
8675   ins_encode %{
8676     int vector_len = 1;
8677     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8678   %}
8679   ins_pipe( pipe_slow );
8680 %}
8681 
8682 instruct vsqrt16F_reg(vecZ dst, vecZ src) %{
8683   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8684   match(Set dst (SqrtVF src));
8685   format %{ "vsqrtps  $dst,$src\t! sqrt packed16F" %}
8686   ins_encode %{
8687     int vector_len = 2;
8688     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8689   %}
8690   ins_pipe( pipe_slow );
8691 %}
8692 
8693 instruct vsqrt16F_mem(vecZ dst, memory mem) %{
8694   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8695   match(Set dst (SqrtVF (LoadVector mem)));
8696   format %{ "vsqrtps  $dst,$mem\t! sqrt packed16F" %}
8697   ins_encode %{
8698     int vector_len = 2;
8699     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8700   %}
8701   ins_pipe( pipe_slow );
8702 %}
8703 
8704 // ------------------------------ LeftShift -----------------------------------
8705 
8706 // Shorts/Chars vector left shift
8707 instruct vsll2S(vecS dst, vecS shift) %{
8708   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8709   match(Set dst (LShiftVS dst shift));
8710   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
8711   ins_encode %{
8712     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8713   %}
8714   ins_pipe( pipe_slow );
8715 %}
8716 
8717 instruct vsll2S_imm(vecS dst, immI8 shift) %{
8718   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8719   match(Set dst (LShiftVS dst shift));
8720   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
8721   ins_encode %{
8722     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8723   %}
8724   ins_pipe( pipe_slow );
8725 %}
8726 
8727 instruct vsll2S_reg_avx(vecS dst, vecS src, vecS shift) %{
8728   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
8729   match(Set dst (LShiftVS src shift));
8730   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8731   ins_encode %{
8732     int vector_len = 0;
8733     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8734   %}
8735   ins_pipe( pipe_slow );
8736 %}
8737 
8738 instruct vsll2S_reg_evex(vecS dst, vecS src, vecS shift) %{
8739   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
8740   match(Set dst (LShiftVS src shift));
8741   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8742   ins_encode %{
8743     int vector_len = 0;
8744     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8745   %}
8746   ins_pipe( pipe_slow );
8747 %}
8748 
8749 instruct vsll2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
8750   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
8751   match(Set dst (LShiftVS dst shift));
8752   effect(TEMP src);
8753   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8754   ins_encode %{
8755     int vector_len = 0;
8756     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8757   %}
8758   ins_pipe( pipe_slow );
8759 %}
8760 
8761 instruct vsll2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
8762   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
8763   match(Set dst (LShiftVS src shift));
8764   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8765   ins_encode %{
8766     int vector_len = 0;
8767     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8768   %}
8769   ins_pipe( pipe_slow );
8770 %}
8771 
8772 instruct vsll2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
8773   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
8774   match(Set dst (LShiftVS src shift));
8775   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8776   ins_encode %{
8777     int vector_len = 0;
8778     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8779   %}
8780   ins_pipe( pipe_slow );
8781 %}
8782 
8783 instruct vsll2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
8784   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
8785   match(Set dst (LShiftVS dst shift));
8786   effect(TEMP src);
8787   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
8788   ins_encode %{
8789     int vector_len = 0;
8790     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8791   %}
8792   ins_pipe( pipe_slow );
8793 %}
8794 
8795 instruct vsll4S(vecD dst, vecS shift) %{
8796   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8797   match(Set dst (LShiftVS dst shift));
8798   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
8799   ins_encode %{
8800     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8801   %}
8802   ins_pipe( pipe_slow );
8803 %}
8804 
8805 instruct vsll4S_imm(vecD dst, immI8 shift) %{
8806   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8807   match(Set dst (LShiftVS dst shift));
8808   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
8809   ins_encode %{
8810     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8811   %}
8812   ins_pipe( pipe_slow );
8813 %}
8814 
8815 instruct vsll4S_reg_avx(vecD dst, vecD src, vecS shift) %{
8816   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
8817   match(Set dst (LShiftVS src shift));
8818   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8819   ins_encode %{
8820     int vector_len = 0;
8821     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8822   %}
8823   ins_pipe( pipe_slow );
8824 %}
8825 
8826 instruct vsll4S_reg_evex(vecD dst, vecD src, vecS shift) %{
8827   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
8828   match(Set dst (LShiftVS src shift));
8829   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8830   ins_encode %{
8831     int vector_len = 0;
8832     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8833   %}
8834   ins_pipe( pipe_slow );
8835 %}
8836 
8837 instruct vsll4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
8838   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
8839   match(Set dst (LShiftVS dst shift));
8840   effect(TEMP src);
8841   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8842   ins_encode %{
8843     int vector_len = 0;
8844     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8845   %}
8846   ins_pipe( pipe_slow );
8847 %}
8848 
8849 instruct vsll4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
8850   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
8851   match(Set dst (LShiftVS src shift));
8852   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8853   ins_encode %{
8854     int vector_len = 0;
8855     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8856   %}
8857   ins_pipe( pipe_slow );
8858 %}
8859 
8860 instruct vsll4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
8861   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
8862   match(Set dst (LShiftVS src shift));
8863   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8864   ins_encode %{
8865     int vector_len = 0;
8866     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8867   %}
8868   ins_pipe( pipe_slow );
8869 %}
8870 
8871 instruct vsll4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
8872   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
8873   match(Set dst (LShiftVS dst shift));
8874   effect(TEMP src);
8875   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
8876   ins_encode %{
8877     int vector_len = 0;
8878     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8879   %}
8880   ins_pipe( pipe_slow );
8881 %}
8882 
8883 instruct vsll8S(vecX dst, vecS shift) %{
8884   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8885   match(Set dst (LShiftVS dst shift));
8886   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
8887   ins_encode %{
8888     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
8889   %}
8890   ins_pipe( pipe_slow );
8891 %}
8892 
8893 instruct vsll8S_imm(vecX dst, immI8 shift) %{
8894   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
8895   match(Set dst (LShiftVS dst shift));
8896   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
8897   ins_encode %{
8898     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
8899   %}
8900   ins_pipe( pipe_slow );
8901 %}
8902 
8903 instruct vsll8S_reg_avx(vecX dst, vecX src, vecS shift) %{
8904   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
8905   match(Set dst (LShiftVS src shift));
8906   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8907   ins_encode %{
8908     int vector_len = 0;
8909     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8910   %}
8911   ins_pipe( pipe_slow );
8912 %}
8913 
8914 instruct vsll8S_reg_evex(vecX dst, vecX src, vecS shift) %{
8915   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
8916   match(Set dst (LShiftVS src shift));
8917   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8918   ins_encode %{
8919     int vector_len = 0;
8920     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8921   %}
8922   ins_pipe( pipe_slow );
8923 %}
8924 
8925 instruct vsll8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
8926   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
8927   match(Set dst (LShiftVS dst shift));
8928   effect(TEMP src);
8929   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8930   ins_encode %{
8931     int vector_len = 0;
8932     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8933   %}
8934   ins_pipe( pipe_slow );
8935 %}
8936 
8937 instruct vsll8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
8938   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
8939   match(Set dst (LShiftVS src shift));
8940   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8941   ins_encode %{
8942     int vector_len = 0;
8943     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8944   %}
8945   ins_pipe( pipe_slow );
8946 %}
8947 
8948 instruct vsll8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
8949   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
8950   match(Set dst (LShiftVS src shift));
8951   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8952   ins_encode %{
8953     int vector_len = 0;
8954     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8955   %}
8956   ins_pipe( pipe_slow );
8957 %}
8958 
8959 instruct vsll8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
8960   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
8961   match(Set dst (LShiftVS dst shift));
8962   effect(TEMP src);
8963   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
8964   ins_encode %{
8965     int vector_len = 0;
8966     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
8967   %}
8968   ins_pipe( pipe_slow );
8969 %}
8970 
8971 instruct vsll16S_reg_avx(vecY dst, vecY src, vecS shift) %{
8972   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
8973   match(Set dst (LShiftVS src shift));
8974   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8975   ins_encode %{
8976     int vector_len = 1;
8977     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8978   %}
8979   ins_pipe( pipe_slow );
8980 %}
8981 
8982 instruct vsll16S_reg_evex(vecY dst, vecY src, vecS shift) %{
8983   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
8984   match(Set dst (LShiftVS src shift));
8985   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8986   ins_encode %{
8987     int vector_len = 1;
8988     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8989   %}
8990   ins_pipe( pipe_slow );
8991 %}
8992 
8993 instruct vsll16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
8994   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
8995   match(Set dst (LShiftVS dst shift));
8996   effect(TEMP src);
8997   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
8998   ins_encode %{
8999     int vector_len = 1;
9000     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9001   %}
9002   ins_pipe( pipe_slow );
9003 %}
9004 
9005 instruct vsll16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
9006   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9007   match(Set dst (LShiftVS src shift));
9008   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9009   ins_encode %{
9010     int vector_len = 1;
9011     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9012   %}
9013   ins_pipe( pipe_slow );
9014 %}
9015 
9016 instruct vsll16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
9017   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9018   match(Set dst (LShiftVS src shift));
9019   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9020   ins_encode %{
9021     int vector_len = 1;
9022     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9023   %}
9024   ins_pipe( pipe_slow );
9025 %}
9026 
9027 instruct vsll16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
9028   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9029   match(Set dst (LShiftVS dst shift));
9030   effect(TEMP src);
9031   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
9032   ins_encode %{
9033     int vector_len = 1;
9034     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9035   %}
9036   ins_pipe( pipe_slow );
9037 %}
9038 
9039 instruct vsll32S_reg(vecZ dst, vecZ src, vecS shift) %{
9040   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9041   match(Set dst (LShiftVS src shift));
9042   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
9043   ins_encode %{
9044     int vector_len = 2;
9045     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9046   %}
9047   ins_pipe( pipe_slow );
9048 %}
9049 
9050 instruct vsll32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9051   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9052   match(Set dst (LShiftVS src shift));
9053   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed32S" %}
9054   ins_encode %{
9055     int vector_len = 2;
9056     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9057   %}
9058   ins_pipe( pipe_slow );
9059 %}
9060 
9061 // Integers vector left shift
9062 instruct vsll2I(vecD dst, vecS shift) %{
9063   predicate(n->as_Vector()->length() == 2);
9064   match(Set dst (LShiftVI dst shift));
9065   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
9066   ins_encode %{
9067     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
9068   %}
9069   ins_pipe( pipe_slow );
9070 %}
9071 
9072 instruct vsll2I_imm(vecD dst, immI8 shift) %{
9073   predicate(n->as_Vector()->length() == 2);
9074   match(Set dst (LShiftVI dst shift));
9075   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
9076   ins_encode %{
9077     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
9078   %}
9079   ins_pipe( pipe_slow );
9080 %}
9081 
9082 instruct vsll2I_reg(vecD dst, vecD src, vecS shift) %{
9083   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9084   match(Set dst (LShiftVI src shift));
9085   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
9086   ins_encode %{
9087     int vector_len = 0;
9088     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9089   %}
9090   ins_pipe( pipe_slow );
9091 %}
9092 
9093 instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
9094   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9095   match(Set dst (LShiftVI src shift));
9096   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
9097   ins_encode %{
9098     int vector_len = 0;
9099     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9100   %}
9101   ins_pipe( pipe_slow );
9102 %}
9103 
9104 instruct vsll4I(vecX dst, vecS shift) %{
9105   predicate(n->as_Vector()->length() == 4);
9106   match(Set dst (LShiftVI dst shift));
9107   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
9108   ins_encode %{
9109     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
9110   %}
9111   ins_pipe( pipe_slow );
9112 %}
9113 
9114 instruct vsll4I_imm(vecX dst, immI8 shift) %{
9115   predicate(n->as_Vector()->length() == 4);
9116   match(Set dst (LShiftVI dst shift));
9117   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
9118   ins_encode %{
9119     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
9120   %}
9121   ins_pipe( pipe_slow );
9122 %}
9123 
9124 instruct vsll4I_reg(vecX dst, vecX src, vecS shift) %{
9125   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9126   match(Set dst (LShiftVI src shift));
9127   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
9128   ins_encode %{
9129     int vector_len = 0;
9130     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9131   %}
9132   ins_pipe( pipe_slow );
9133 %}
9134 
9135 instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
9136   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9137   match(Set dst (LShiftVI src shift));
9138   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
9139   ins_encode %{
9140     int vector_len = 0;
9141     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9142   %}
9143   ins_pipe( pipe_slow );
9144 %}
9145 
9146 instruct vsll8I_reg(vecY dst, vecY src, vecS shift) %{
9147   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9148   match(Set dst (LShiftVI src shift));
9149   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
9150   ins_encode %{
9151     int vector_len = 1;
9152     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9153   %}
9154   ins_pipe( pipe_slow );
9155 %}
9156 
9157 instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
9158   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9159   match(Set dst (LShiftVI src shift));
9160   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
9161   ins_encode %{
9162     int vector_len = 1;
9163     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9164   %}
9165   ins_pipe( pipe_slow );
9166 %}
9167 
9168 instruct vsll16I_reg(vecZ dst, vecZ src, vecS shift) %{
9169   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9170   match(Set dst (LShiftVI src shift));
9171   format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
9172   ins_encode %{
9173     int vector_len = 2;
9174     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9175   %}
9176   ins_pipe( pipe_slow );
9177 %}
9178 
9179 instruct vsll16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9180   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9181   match(Set dst (LShiftVI src shift));
9182   format %{ "vpslld  $dst,$src,$shift\t! left shift packed16I" %}
9183   ins_encode %{
9184     int vector_len = 2;
9185     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9186   %}
9187   ins_pipe( pipe_slow );
9188 %}
9189 
9190 // Longs vector left shift
9191 instruct vsll2L(vecX dst, vecS shift) %{
9192   predicate(n->as_Vector()->length() == 2);
9193   match(Set dst (LShiftVL dst shift));
9194   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
9195   ins_encode %{
9196     __ psllq($dst$$XMMRegister, $shift$$XMMRegister);
9197   %}
9198   ins_pipe( pipe_slow );
9199 %}
9200 
9201 instruct vsll2L_imm(vecX dst, immI8 shift) %{
9202   predicate(n->as_Vector()->length() == 2);
9203   match(Set dst (LShiftVL dst shift));
9204   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
9205   ins_encode %{
9206     __ psllq($dst$$XMMRegister, (int)$shift$$constant);
9207   %}
9208   ins_pipe( pipe_slow );
9209 %}
9210 
9211 instruct vsll2L_reg(vecX dst, vecX src, vecS shift) %{
9212   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9213   match(Set dst (LShiftVL src shift));
9214   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
9215   ins_encode %{
9216     int vector_len = 0;
9217     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9218   %}
9219   ins_pipe( pipe_slow );
9220 %}
9221 
9222 instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
9223   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9224   match(Set dst (LShiftVL src shift));
9225   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
9226   ins_encode %{
9227     int vector_len = 0;
9228     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9229   %}
9230   ins_pipe( pipe_slow );
9231 %}
9232 
9233 instruct vsll4L_reg(vecY dst, vecY src, vecS shift) %{
9234   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9235   match(Set dst (LShiftVL src shift));
9236   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
9237   ins_encode %{
9238     int vector_len = 1;
9239     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9240   %}
9241   ins_pipe( pipe_slow );
9242 %}
9243 
9244 instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
9245   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9246   match(Set dst (LShiftVL src shift));
9247   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
9248   ins_encode %{
9249     int vector_len = 1;
9250     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9251   %}
9252   ins_pipe( pipe_slow );
9253 %}
9254 
9255 instruct vsll8L_reg(vecZ dst, vecZ src, vecS shift) %{
9256   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9257   match(Set dst (LShiftVL src shift));
9258   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
9259   ins_encode %{
9260     int vector_len = 2;
9261     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9262   %}
9263   ins_pipe( pipe_slow );
9264 %}
9265 
9266 instruct vsll8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9267   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9268   match(Set dst (LShiftVL src shift));
9269   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed8L" %}
9270   ins_encode %{
9271     int vector_len = 2;
9272     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9273   %}
9274   ins_pipe( pipe_slow );
9275 %}
9276 
9277 // ----------------------- LogicalRightShift -----------------------------------
9278 
9279 // Shorts vector logical right shift produces incorrect Java result
9280 // for negative data because java code convert short value into int with
9281 // sign extension before a shift. But char vectors are fine since chars are
9282 // unsigned values.
9283 
9284 instruct vsrl2S(vecS dst, vecS shift) %{
9285   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9286   match(Set dst (URShiftVS dst shift));
9287   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
9288   ins_encode %{
9289     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
9290   %}
9291   ins_pipe( pipe_slow );
9292 %}
9293 
9294 instruct vsrl2S_imm(vecS dst, immI8 shift) %{
9295   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9296   match(Set dst (URShiftVS dst shift));
9297   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
9298   ins_encode %{
9299     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
9300   %}
9301   ins_pipe( pipe_slow );
9302 %}
9303 
9304 instruct vsrl2S_reg_avx(vecS dst, vecS src, vecS shift) %{
9305   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9306   match(Set dst (URShiftVS src shift));
9307   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9308   ins_encode %{
9309     int vector_len = 0;
9310     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9311   %}
9312   ins_pipe( pipe_slow );
9313 %}
9314 
9315 instruct vsrl2S_reg_evex(vecS dst, vecS src, vecS shift) %{
9316   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9317   match(Set dst (URShiftVS src shift));
9318   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9319   ins_encode %{
9320     int vector_len = 0;
9321     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9322   %}
9323   ins_pipe( pipe_slow );
9324 %}
9325 
9326 instruct vsrl2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
9327   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9328   match(Set dst (URShiftVS dst shift));
9329   effect(TEMP src);
9330   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9331   ins_encode %{
9332     int vector_len = 0;
9333     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9334   %}
9335   ins_pipe( pipe_slow );
9336 %}
9337 
9338 instruct vsrl2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
9339   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9340   match(Set dst (URShiftVS src shift));
9341   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9342   ins_encode %{
9343     int vector_len = 0;
9344     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9345   %}
9346   ins_pipe( pipe_slow );
9347 %}
9348 
9349 instruct vsrl2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
9350   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9351   match(Set dst (URShiftVS src shift));
9352   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9353   ins_encode %{
9354     int vector_len = 0;
9355     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9356   %}
9357   ins_pipe( pipe_slow );
9358 %}
9359 
9360 instruct vsrl2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
9361   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9362   match(Set dst (URShiftVS dst shift));
9363   effect(TEMP src);
9364   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
9365   ins_encode %{
9366     int vector_len = 0;
9367     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9368   %}
9369   ins_pipe( pipe_slow );
9370 %}
9371 
9372 instruct vsrl4S(vecD dst, vecS shift) %{
9373   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9374   match(Set dst (URShiftVS dst shift));
9375   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
9376   ins_encode %{
9377     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
9378   %}
9379   ins_pipe( pipe_slow );
9380 %}
9381 
9382 instruct vsrl4S_imm(vecD dst, immI8 shift) %{
9383   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9384   match(Set dst (URShiftVS dst shift));
9385   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
9386   ins_encode %{
9387     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
9388   %}
9389   ins_pipe( pipe_slow );
9390 %}
9391 
9392 instruct vsrl4S_reg_avx(vecD dst, vecD src, vecS shift) %{
9393   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9394   match(Set dst (URShiftVS src shift));
9395   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9396   ins_encode %{
9397     int vector_len = 0;
9398     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9399   %}
9400   ins_pipe( pipe_slow );
9401 %}
9402 
9403 instruct vsrl4S_reg_evex(vecD dst, vecD src, vecS shift) %{
9404   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9405   match(Set dst (URShiftVS src shift));
9406   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9407   ins_encode %{
9408     int vector_len = 0;
9409     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9410   %}
9411   ins_pipe( pipe_slow );
9412 %}
9413 
9414 instruct vsrl4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
9415   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9416   match(Set dst (URShiftVS dst shift));
9417   effect(TEMP src);
9418   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9419   ins_encode %{
9420     int vector_len = 0;
9421     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9422   %}
9423   ins_pipe( pipe_slow );
9424 %}
9425 
9426 instruct vsrl4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
9427   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9428   match(Set dst (URShiftVS src shift));
9429   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9430   ins_encode %{
9431     int vector_len = 0;
9432     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9433   %}
9434   ins_pipe( pipe_slow );
9435 %}
9436 
9437 instruct vsrl4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
9438   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9439   match(Set dst (URShiftVS src shift));
9440   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9441   ins_encode %{
9442     int vector_len = 0;
9443     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9444   %}
9445   ins_pipe( pipe_slow );
9446 %}
9447 
9448 instruct vsrl4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
9449   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9450   match(Set dst (URShiftVS dst shift));
9451   effect(TEMP src);
9452   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
9453   ins_encode %{
9454     int vector_len = 0;
9455     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9456   %}
9457   ins_pipe( pipe_slow );
9458 %}
9459 
9460 instruct vsrl8S(vecX dst, vecS shift) %{
9461   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9462   match(Set dst (URShiftVS dst shift));
9463   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
9464   ins_encode %{
9465     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
9466   %}
9467   ins_pipe( pipe_slow );
9468 %}
9469 
9470 instruct vsrl8S_imm(vecX dst, immI8 shift) %{
9471   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
9472   match(Set dst (URShiftVS dst shift));
9473   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
9474   ins_encode %{
9475     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
9476   %}
9477   ins_pipe( pipe_slow );
9478 %}
9479 
9480 instruct vsrl8S_reg_avx(vecX dst, vecX src, vecS shift) %{
9481   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
9482   match(Set dst (URShiftVS src shift));
9483   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9484   ins_encode %{
9485     int vector_len = 0;
9486     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9487   %}
9488   ins_pipe( pipe_slow );
9489 %}
9490 
9491 instruct vsrl8S_reg_evex(vecX dst, vecX src, vecS shift) %{
9492   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
9493   match(Set dst (URShiftVS src shift));
9494   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9495   ins_encode %{
9496     int vector_len = 0;
9497     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9498   %}
9499   ins_pipe( pipe_slow );
9500 %}
9501 
9502 instruct vsrl8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
9503   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
9504   match(Set dst (URShiftVS dst shift));
9505   effect(TEMP src);
9506   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9507   ins_encode %{
9508     int vector_len = 0;
9509     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9510   %}
9511   ins_pipe( pipe_slow );
9512 %}
9513 
9514 instruct vsrl8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
9515   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
9516   match(Set dst (URShiftVS src shift));
9517   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9518   ins_encode %{
9519     int vector_len = 0;
9520     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9521   %}
9522   ins_pipe( pipe_slow );
9523 %}
9524 
9525 instruct vsrl8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
9526   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
9527   match(Set dst (URShiftVS src shift));
9528   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9529   ins_encode %{
9530     int vector_len = 0;
9531     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9532   %}
9533   ins_pipe( pipe_slow );
9534 %}
9535 
9536 instruct vsrl8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
9537   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
9538   match(Set dst (URShiftVS dst shift));
9539   effect(TEMP src);
9540   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
9541   ins_encode %{
9542     int vector_len = 0;
9543     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9544   %}
9545   ins_pipe( pipe_slow );
9546 %}
9547 
9548 instruct vsrl16S_reg_avx(vecY dst, vecY src, vecS shift) %{
9549   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9550   match(Set dst (URShiftVS src shift));
9551   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9552   ins_encode %{
9553     int vector_len = 1;
9554     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9555   %}
9556   ins_pipe( pipe_slow );
9557 %}
9558 
9559 instruct vsrl16S_reg_evex(vecY dst, vecY src, vecS shift) %{
9560   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9561   match(Set dst (URShiftVS src shift));
9562   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9563   ins_encode %{
9564     int vector_len = 1;
9565     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9566   %}
9567   ins_pipe( pipe_slow );
9568 %}
9569 
9570 instruct vsrl16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
9571   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9572   match(Set dst (URShiftVS dst shift));
9573   effect(TEMP src);
9574   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9575   ins_encode %{
9576     int vector_len = 1;
9577     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9578   %}
9579   ins_pipe( pipe_slow );
9580 %}
9581 
9582 instruct vsrl16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
9583   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
9584   match(Set dst (URShiftVS src shift));
9585   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9586   ins_encode %{
9587     int vector_len = 1;
9588     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9589   %}
9590   ins_pipe( pipe_slow );
9591 %}
9592 
9593 instruct vsrl16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
9594   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
9595   match(Set dst (URShiftVS src shift));
9596   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9597   ins_encode %{
9598     int vector_len = 1;
9599     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9600   %}
9601   ins_pipe( pipe_slow );
9602 %}
9603 
9604 instruct vsrl16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
9605   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
9606   match(Set dst (URShiftVS dst shift));
9607   effect(TEMP src);
9608   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
9609   ins_encode %{
9610     int vector_len = 1;
9611     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9612   %}
9613   ins_pipe( pipe_slow );
9614 %}
9615 
9616 instruct vsrl32S_reg(vecZ dst, vecZ src, vecS shift) %{
9617   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9618   match(Set dst (URShiftVS src shift));
9619   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
9620   ins_encode %{
9621     int vector_len = 2;
9622     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9623   %}
9624   ins_pipe( pipe_slow );
9625 %}
9626 
9627 instruct vsrl32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9628   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
9629   match(Set dst (URShiftVS src shift));
9630   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed32S" %}
9631   ins_encode %{
9632     int vector_len = 2;
9633     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9634   %}
9635   ins_pipe( pipe_slow );
9636 %}
9637 
9638 // Integers vector logical right shift
9639 instruct vsrl2I(vecD dst, vecS shift) %{
9640   predicate(n->as_Vector()->length() == 2);
9641   match(Set dst (URShiftVI dst shift));
9642   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
9643   ins_encode %{
9644     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
9645   %}
9646   ins_pipe( pipe_slow );
9647 %}
9648 
9649 instruct vsrl2I_imm(vecD dst, immI8 shift) %{
9650   predicate(n->as_Vector()->length() == 2);
9651   match(Set dst (URShiftVI dst shift));
9652   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
9653   ins_encode %{
9654     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
9655   %}
9656   ins_pipe( pipe_slow );
9657 %}
9658 
9659 instruct vsrl2I_reg(vecD dst, vecD src, vecS shift) %{
9660   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9661   match(Set dst (URShiftVI src shift));
9662   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
9663   ins_encode %{
9664     int vector_len = 0;
9665     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9666   %}
9667   ins_pipe( pipe_slow );
9668 %}
9669 
9670 instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
9671   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9672   match(Set dst (URShiftVI src shift));
9673   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
9674   ins_encode %{
9675     int vector_len = 0;
9676     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9677   %}
9678   ins_pipe( pipe_slow );
9679 %}
9680 
9681 instruct vsrl4I(vecX dst, vecS shift) %{
9682   predicate(n->as_Vector()->length() == 4);
9683   match(Set dst (URShiftVI dst shift));
9684   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
9685   ins_encode %{
9686     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
9687   %}
9688   ins_pipe( pipe_slow );
9689 %}
9690 
9691 instruct vsrl4I_imm(vecX dst, immI8 shift) %{
9692   predicate(n->as_Vector()->length() == 4);
9693   match(Set dst (URShiftVI dst shift));
9694   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
9695   ins_encode %{
9696     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
9697   %}
9698   ins_pipe( pipe_slow );
9699 %}
9700 
9701 instruct vsrl4I_reg(vecX dst, vecX src, vecS shift) %{
9702   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9703   match(Set dst (URShiftVI src shift));
9704   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
9705   ins_encode %{
9706     int vector_len = 0;
9707     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9708   %}
9709   ins_pipe( pipe_slow );
9710 %}
9711 
9712 instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
9713   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9714   match(Set dst (URShiftVI src shift));
9715   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
9716   ins_encode %{
9717     int vector_len = 0;
9718     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9719   %}
9720   ins_pipe( pipe_slow );
9721 %}
9722 
9723 instruct vsrl8I_reg(vecY dst, vecY src, vecS shift) %{
9724   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9725   match(Set dst (URShiftVI src shift));
9726   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
9727   ins_encode %{
9728     int vector_len = 1;
9729     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9730   %}
9731   ins_pipe( pipe_slow );
9732 %}
9733 
9734 instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
9735   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9736   match(Set dst (URShiftVI src shift));
9737   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
9738   ins_encode %{
9739     int vector_len = 1;
9740     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9741   %}
9742   ins_pipe( pipe_slow );
9743 %}
9744 
9745 instruct vsrl16I_reg(vecZ dst, vecZ src, vecS shift) %{
9746   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9747   match(Set dst (URShiftVI src shift));
9748   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
9749   ins_encode %{
9750     int vector_len = 2;
9751     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9752   %}
9753   ins_pipe( pipe_slow );
9754 %}
9755 
9756 instruct vsrl16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9757   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9758   match(Set dst (URShiftVI src shift));
9759   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed16I" %}
9760   ins_encode %{
9761     int vector_len = 2;
9762     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9763   %}
9764   ins_pipe( pipe_slow );
9765 %}
9766 
9767 // Longs vector logical right shift
9768 instruct vsrl2L(vecX dst, vecS shift) %{
9769   predicate(n->as_Vector()->length() == 2);
9770   match(Set dst (URShiftVL dst shift));
9771   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
9772   ins_encode %{
9773     __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
9774   %}
9775   ins_pipe( pipe_slow );
9776 %}
9777 
9778 instruct vsrl2L_imm(vecX dst, immI8 shift) %{
9779   predicate(n->as_Vector()->length() == 2);
9780   match(Set dst (URShiftVL dst shift));
9781   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
9782   ins_encode %{
9783     __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
9784   %}
9785   ins_pipe( pipe_slow );
9786 %}
9787 
9788 instruct vsrl2L_reg(vecX dst, vecX src, vecS shift) %{
9789   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9790   match(Set dst (URShiftVL src shift));
9791   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
9792   ins_encode %{
9793     int vector_len = 0;
9794     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9795   %}
9796   ins_pipe( pipe_slow );
9797 %}
9798 
9799 instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
9800   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9801   match(Set dst (URShiftVL src shift));
9802   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
9803   ins_encode %{
9804     int vector_len = 0;
9805     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9806   %}
9807   ins_pipe( pipe_slow );
9808 %}
9809 
9810 instruct vsrl4L_reg(vecY dst, vecY src, vecS shift) %{
9811   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9812   match(Set dst (URShiftVL src shift));
9813   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
9814   ins_encode %{
9815     int vector_len = 1;
9816     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9817   %}
9818   ins_pipe( pipe_slow );
9819 %}
9820 
9821 instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
9822   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
9823   match(Set dst (URShiftVL src shift));
9824   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
9825   ins_encode %{
9826     int vector_len = 1;
9827     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9828   %}
9829   ins_pipe( pipe_slow );
9830 %}
9831 
9832 instruct vsrl8L_reg(vecZ dst, vecZ src, vecS shift) %{
9833   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9834   match(Set dst (URShiftVL src shift));
9835   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
9836   ins_encode %{
9837     int vector_len = 2;
9838     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9839   %}
9840   ins_pipe( pipe_slow );
9841 %}
9842 
9843 instruct vsrl8L_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
9844   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9845   match(Set dst (URShiftVL src shift));
9846   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed8L" %}
9847   ins_encode %{
9848     int vector_len = 2;
9849     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9850   %}
9851   ins_pipe( pipe_slow );
9852 %}
9853 
9854 // ------------------- ArithmeticRightShift -----------------------------------
9855 
9856 // Shorts/Chars vector arithmetic right shift
9857 instruct vsra2S(vecS dst, vecS shift) %{
9858   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
9859   match(Set dst (RShiftVS dst shift));
9860   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
9861   ins_encode %{
9862     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9863   %}
9864   ins_pipe( pipe_slow );
9865 %}
9866 
9867 instruct vsra2S_imm(vecS dst, immI8 shift) %{
9868   predicate(n->as_Vector()->length() == 2);
9869   match(Set dst (RShiftVS dst shift));
9870   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
9871   ins_encode %{
9872     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9873   %}
9874   ins_pipe( pipe_slow );
9875 %}
9876 
9877 instruct vsra2S_reg_avx(vecS dst, vecS src, vecS shift) %{
9878   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9879   match(Set dst (RShiftVS src shift));
9880   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9881   ins_encode %{
9882     int vector_len = 0;
9883     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9884   %}
9885   ins_pipe( pipe_slow );
9886 %}
9887 
9888 instruct vsra2S_reg_evex(vecS dst, vecS src, vecS shift) %{
9889   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9890   match(Set dst (RShiftVS src shift));
9891   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9892   ins_encode %{
9893     int vector_len = 0;
9894     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9895   %}
9896   ins_pipe( pipe_slow );
9897 %}
9898 
9899 instruct vsra2S_reg_evex_special(vecS dst, vecS src, vecS shift) %{
9900   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9901   match(Set dst (RShiftVS dst shift));
9902   effect(TEMP src);
9903   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9904   ins_encode %{
9905     int vector_len = 0;
9906     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9907   %}
9908   ins_pipe( pipe_slow );
9909 %}
9910 
9911 instruct vsra2S_reg_imm_avx(vecS dst, vecS src, immI8 shift) %{
9912   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 2);
9913   match(Set dst (RShiftVS src shift));
9914   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9915   ins_encode %{
9916     int vector_len = 0;
9917     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9918   %}
9919   ins_pipe( pipe_slow );
9920 %}
9921 
9922 instruct vsra2S_reg_imm_evex(vecS dst, vecS src, immI8 shift) %{
9923   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 2);
9924   match(Set dst (RShiftVS src shift));
9925   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9926   ins_encode %{
9927     int vector_len = 0;
9928     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9929   %}
9930   ins_pipe( pipe_slow );
9931 %}
9932 
9933 instruct vsra2S_reg_imm_evex_special(vecS dst, vecS src, immI8 shift) %{
9934   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 2);
9935   match(Set dst (RShiftVS dst shift));
9936   effect(TEMP src);
9937   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
9938   ins_encode %{
9939     int vector_len = 0;
9940     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
9941   %}
9942   ins_pipe( pipe_slow );
9943 %}
9944 
9945 instruct vsra4S(vecD dst, vecS shift) %{
9946   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9947   match(Set dst (RShiftVS dst shift));
9948   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
9949   ins_encode %{
9950     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
9951   %}
9952   ins_pipe( pipe_slow );
9953 %}
9954 
9955 instruct vsra4S_imm(vecD dst, immI8 shift) %{
9956   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
9957   match(Set dst (RShiftVS dst shift));
9958   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
9959   ins_encode %{
9960     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
9961   %}
9962   ins_pipe( pipe_slow );
9963 %}
9964 
9965 instruct vsra4S_reg_avx(vecD dst, vecD src, vecS shift) %{
9966   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
9967   match(Set dst (RShiftVS src shift));
9968   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9969   ins_encode %{
9970     int vector_len = 0;
9971     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9972   %}
9973   ins_pipe( pipe_slow );
9974 %}
9975 
9976 instruct vsra4S_reg_evex(vecD dst, vecD src, vecS shift) %{
9977   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
9978   match(Set dst (RShiftVS src shift));
9979   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9980   ins_encode %{
9981     int vector_len = 0;
9982     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9983   %}
9984   ins_pipe( pipe_slow );
9985 %}
9986 
9987 instruct vsra4S_reg_evex_special(vecD dst, vecD src, vecS shift) %{
9988   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
9989   match(Set dst (RShiftVS dst shift));
9990   effect(TEMP src);
9991   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
9992   ins_encode %{
9993     int vector_len = 0;
9994     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
9995   %}
9996   ins_pipe( pipe_slow );
9997 %}
9998 
9999 instruct vsra4S_reg_imm_avx(vecD dst, vecD src, immI8 shift) %{
10000   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 4);
10001   match(Set dst (RShiftVS src shift));
10002   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
10003   ins_encode %{
10004     int vector_len = 0;
10005     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10006   %}
10007   ins_pipe( pipe_slow );
10008 %}
10009 
10010 instruct vsra4S_reg_imm_evex(vecD dst, vecD src, immI8 shift) %{
10011   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 4);
10012   match(Set dst (RShiftVS src shift));
10013   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
10014   ins_encode %{
10015     int vector_len = 0;
10016     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10017   %}
10018   ins_pipe( pipe_slow );
10019 %}
10020 
10021 instruct vsra4S_reg_imm_evex_special(vecD dst, vecD src, immI8 shift) %{
10022   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 4);
10023   match(Set dst (RShiftVS dst shift));
10024   effect(TEMP src);
10025   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
10026   ins_encode %{
10027     int vector_len = 0;
10028     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10029   %}
10030   ins_pipe( pipe_slow );
10031 %}
10032 
10033 instruct vsra8S(vecX dst, vecS shift) %{
10034   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
10035   match(Set dst (RShiftVS dst shift));
10036   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
10037   ins_encode %{
10038     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
10039   %}
10040   ins_pipe( pipe_slow );
10041 %}
10042 
10043 instruct vsra8S_imm(vecX dst, immI8 shift) %{
10044   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
10045   match(Set dst (RShiftVS dst shift));
10046   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
10047   ins_encode %{
10048     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
10049   %}
10050   ins_pipe( pipe_slow );
10051 %}
10052 
10053 instruct vsra8S_reg_avx(vecX dst, vecX src, vecS shift) %{
10054   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
10055   match(Set dst (RShiftVS src shift));
10056   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10057   ins_encode %{
10058     int vector_len = 0;
10059     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10060   %}
10061   ins_pipe( pipe_slow );
10062 %}
10063 
10064 instruct vsra8S_reg_evex(vecX dst, vecX src, vecS shift) %{
10065   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
10066   match(Set dst (RShiftVS src shift));
10067   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10068   ins_encode %{
10069     int vector_len = 0;
10070     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10071   %}
10072   ins_pipe( pipe_slow );
10073 %}
10074 
10075 instruct vsra8S_reg_evex_special(vecX dst, vecX src, vecS shift) %{
10076   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
10077   match(Set dst (RShiftVS dst shift));
10078   effect(TEMP src);
10079   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10080   ins_encode %{
10081     int vector_len = 0;
10082     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10083   %}
10084   ins_pipe( pipe_slow );
10085 %}
10086 
10087 instruct vsra8S_reg_imm_avx(vecX dst, vecX src, immI8 shift) %{
10088   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 8);
10089   match(Set dst (RShiftVS src shift));
10090   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10091   ins_encode %{
10092     int vector_len = 0;
10093     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10094   %}
10095   ins_pipe( pipe_slow );
10096 %}
10097 
10098 instruct vsra8S_reg_imm_evex(vecX dst, vecX src, immI8 shift) %{
10099   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8);
10100   match(Set dst (RShiftVS src shift));
10101   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10102   ins_encode %{
10103     int vector_len = 0;
10104     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10105   %}
10106   ins_pipe( pipe_slow );
10107 %}
10108 
10109 instruct vsra8S_reg_imm_evex_special(vecX dst, vecX src, immI8 shift) %{
10110   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 8);
10111   match(Set dst (RShiftVS dst shift));
10112   effect(TEMP src);
10113   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
10114   ins_encode %{
10115     int vector_len = 0;
10116     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10117   %}
10118   ins_pipe( pipe_slow );
10119 %}
10120 
10121 instruct vsra16S_reg_avx(vecY dst, vecY src, vecS shift) %{
10122   predicate(VM_Version::supports_avx256only() && n->as_Vector()->length() == 16);
10123   match(Set dst (RShiftVS src shift));
10124   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10125   ins_encode %{
10126     int vector_len = 1;
10127     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10128   %}
10129   ins_pipe( pipe_slow );
10130 %}
10131 
10132 instruct vsra16S_reg_evex(vecY dst, vecY src, vecS shift) %{
10133   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
10134   match(Set dst (RShiftVS src shift));
10135   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10136   ins_encode %{
10137     int vector_len = 1;
10138     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10139   %}
10140   ins_pipe( pipe_slow );
10141 %}
10142 
10143 instruct vsra16S_reg_evex_special(vecY dst, vecY src, vecS shift) %{
10144   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
10145   match(Set dst (RShiftVS dst shift));
10146   effect(TEMP src);
10147   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10148   ins_encode %{
10149     int vector_len = 1;
10150     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10151   %}
10152   ins_pipe( pipe_slow );
10153 %}
10154 
10155 instruct vsra16S_reg_imm_avx(vecY dst, vecY src, immI8 shift) %{
10156   predicate(VM_Version::supports_avxonly() && n->as_Vector()->length() == 16);
10157   match(Set dst (RShiftVS src shift));
10158   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10159   ins_encode %{
10160     int vector_len = 1;
10161     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10162   %}
10163   ins_pipe( pipe_slow );
10164 %}
10165 
10166 instruct vsra16S_reg_imm_evex(vecY dst, vecY src, immI8 shift) %{
10167   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 16);
10168   match(Set dst (RShiftVS src shift));
10169   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10170   ins_encode %{
10171     int vector_len = 1;
10172     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10173   %}
10174   ins_pipe( pipe_slow );
10175 %}
10176 
10177 instruct vsra16S_reg_imm_evex_special(vecY dst, vecY src, immI8 shift) %{
10178   predicate(VM_Version::supports_avx512nobw() && n->as_Vector()->length() == 16);
10179   match(Set dst (RShiftVS dst shift));
10180   effect(TEMP src);
10181   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
10182   ins_encode %{
10183     int vector_len = 1;
10184     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10185   %}
10186   ins_pipe( pipe_slow );
10187 %}
10188 
10189 instruct vsra32S_reg(vecZ dst, vecZ src, vecS shift) %{
10190   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
10191   match(Set dst (RShiftVS src shift));
10192   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
10193   ins_encode %{
10194     int vector_len = 2;
10195     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10196   %}
10197   ins_pipe( pipe_slow );
10198 %}
10199 
10200 instruct vsra32S_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
10201   predicate(VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
10202   match(Set dst (RShiftVS src shift));
10203   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed32S" %}
10204   ins_encode %{
10205     int vector_len = 2;
10206     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10207   %}
10208   ins_pipe( pipe_slow );
10209 %}
10210 
10211 // Integers vector arithmetic right shift
10212 instruct vsra2I(vecD dst, vecS shift) %{
10213   predicate(n->as_Vector()->length() == 2);
10214   match(Set dst (RShiftVI dst shift));
10215   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
10216   ins_encode %{
10217     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
10218   %}
10219   ins_pipe( pipe_slow );
10220 %}
10221 
10222 instruct vsra2I_imm(vecD dst, immI8 shift) %{
10223   predicate(n->as_Vector()->length() == 2);
10224   match(Set dst (RShiftVI dst shift));
10225   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
10226   ins_encode %{
10227     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
10228   %}
10229   ins_pipe( pipe_slow );
10230 %}
10231 
10232 instruct vsra2I_reg(vecD dst, vecD src, vecS shift) %{
10233   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
10234   match(Set dst (RShiftVI src shift));
10235   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
10236   ins_encode %{
10237     int vector_len = 0;
10238     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10239   %}
10240   ins_pipe( pipe_slow );
10241 %}
10242 
10243 instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
10244   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
10245   match(Set dst (RShiftVI src shift));
10246   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
10247   ins_encode %{
10248     int vector_len = 0;
10249     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10250   %}
10251   ins_pipe( pipe_slow );
10252 %}
10253 
10254 instruct vsra4I(vecX dst, vecS shift) %{
10255   predicate(n->as_Vector()->length() == 4);
10256   match(Set dst (RShiftVI dst shift));
10257   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
10258   ins_encode %{
10259     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
10260   %}
10261   ins_pipe( pipe_slow );
10262 %}
10263 
10264 instruct vsra4I_imm(vecX dst, immI8 shift) %{
10265   predicate(n->as_Vector()->length() == 4);
10266   match(Set dst (RShiftVI dst shift));
10267   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
10268   ins_encode %{
10269     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
10270   %}
10271   ins_pipe( pipe_slow );
10272 %}
10273 
10274 instruct vsra4I_reg(vecX dst, vecX src, vecS shift) %{
10275   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
10276   match(Set dst (RShiftVI src shift));
10277   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
10278   ins_encode %{
10279     int vector_len = 0;
10280     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10281   %}
10282   ins_pipe( pipe_slow );
10283 %}
10284 
10285 instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
10286   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
10287   match(Set dst (RShiftVI src shift));
10288   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
10289   ins_encode %{
10290     int vector_len = 0;
10291     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10292   %}
10293   ins_pipe( pipe_slow );
10294 %}
10295 
10296 instruct vsra8I_reg(vecY dst, vecY src, vecS shift) %{
10297   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
10298   match(Set dst (RShiftVI src shift));
10299   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
10300   ins_encode %{
10301     int vector_len = 1;
10302     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10303   %}
10304   ins_pipe( pipe_slow );
10305 %}
10306 
10307 instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
10308   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
10309   match(Set dst (RShiftVI src shift));
10310   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
10311   ins_encode %{
10312     int vector_len = 1;
10313     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10314   %}
10315   ins_pipe( pipe_slow );
10316 %}
10317 
10318 instruct vsra16I_reg(vecZ dst, vecZ src, vecS shift) %{
10319   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
10320   match(Set dst (RShiftVI src shift));
10321   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
10322   ins_encode %{
10323     int vector_len = 2;
10324     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
10325   %}
10326   ins_pipe( pipe_slow );
10327 %}
10328 
10329 instruct vsra16I_reg_imm(vecZ dst, vecZ src, immI8 shift) %{
10330   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
10331   match(Set dst (RShiftVI src shift));
10332   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed16I" %}
10333   ins_encode %{
10334     int vector_len = 2;
10335     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector_len);
10336   %}
10337   ins_pipe( pipe_slow );
10338 %}
10339 
10340 // There are no longs vector arithmetic right shift instructions.
10341 
10342 
10343 // --------------------------------- AND --------------------------------------
10344 
10345 instruct vand4B(vecS dst, vecS src) %{
10346   predicate(n->as_Vector()->length_in_bytes() == 4);
10347   match(Set dst (AndV dst src));
10348   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
10349   ins_encode %{
10350     __ pand($dst$$XMMRegister, $src$$XMMRegister);
10351   %}
10352   ins_pipe( pipe_slow );
10353 %}
10354 
10355 instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
10356   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10357   match(Set dst (AndV src1 src2));
10358   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
10359   ins_encode %{
10360     int vector_len = 0;
10361     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10362   %}
10363   ins_pipe( pipe_slow );
10364 %}
10365 
10366 instruct vand4B_mem(vecS dst, vecS src, memory mem) %{
10367   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10368   match(Set dst (AndV src (LoadVector mem)));
10369   format %{ "vpand   $dst,$src,$mem\t! and vectors (4 bytes)" %}
10370   ins_encode %{
10371     int vector_len = 0;
10372     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10373   %}
10374   ins_pipe( pipe_slow );
10375 %}
10376 
10377 instruct vand8B(vecD dst, vecD src) %{
10378   predicate(n->as_Vector()->length_in_bytes() == 8);
10379   match(Set dst (AndV dst src));
10380   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
10381   ins_encode %{
10382     __ pand($dst$$XMMRegister, $src$$XMMRegister);
10383   %}
10384   ins_pipe( pipe_slow );
10385 %}
10386 
10387 instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
10388   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10389   match(Set dst (AndV src1 src2));
10390   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
10391   ins_encode %{
10392     int vector_len = 0;
10393     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10394   %}
10395   ins_pipe( pipe_slow );
10396 %}
10397 
10398 instruct vand8B_mem(vecD dst, vecD src, memory mem) %{
10399   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10400   match(Set dst (AndV src (LoadVector mem)));
10401   format %{ "vpand   $dst,$src,$mem\t! and vectors (8 bytes)" %}
10402   ins_encode %{
10403     int vector_len = 0;
10404     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10405   %}
10406   ins_pipe( pipe_slow );
10407 %}
10408 
10409 instruct vand16B(vecX dst, vecX src) %{
10410   predicate(n->as_Vector()->length_in_bytes() == 16);
10411   match(Set dst (AndV dst src));
10412   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
10413   ins_encode %{
10414     __ pand($dst$$XMMRegister, $src$$XMMRegister);
10415   %}
10416   ins_pipe( pipe_slow );
10417 %}
10418 
10419 instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
10420   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10421   match(Set dst (AndV src1 src2));
10422   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
10423   ins_encode %{
10424     int vector_len = 0;
10425     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10426   %}
10427   ins_pipe( pipe_slow );
10428 %}
10429 
10430 instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
10431   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10432   match(Set dst (AndV src (LoadVector mem)));
10433   format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
10434   ins_encode %{
10435     int vector_len = 0;
10436     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10437   %}
10438   ins_pipe( pipe_slow );
10439 %}
10440 
10441 instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
10442   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10443   match(Set dst (AndV src1 src2));
10444   format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
10445   ins_encode %{
10446     int vector_len = 1;
10447     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10448   %}
10449   ins_pipe( pipe_slow );
10450 %}
10451 
10452 instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
10453   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10454   match(Set dst (AndV src (LoadVector mem)));
10455   format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
10456   ins_encode %{
10457     int vector_len = 1;
10458     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10459   %}
10460   ins_pipe( pipe_slow );
10461 %}
10462 
10463 instruct vand64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
10464   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10465   match(Set dst (AndV src1 src2));
10466   format %{ "vpand   $dst,$src1,$src2\t! and vectors (64 bytes)" %}
10467   ins_encode %{
10468     int vector_len = 2;
10469     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10470   %}
10471   ins_pipe( pipe_slow );
10472 %}
10473 
10474 instruct vand64B_mem(vecZ dst, vecZ src, memory mem) %{
10475   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10476   match(Set dst (AndV src (LoadVector mem)));
10477   format %{ "vpand   $dst,$src,$mem\t! and vectors (64 bytes)" %}
10478   ins_encode %{
10479     int vector_len = 2;
10480     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10481   %}
10482   ins_pipe( pipe_slow );
10483 %}
10484 
10485 // --------------------------------- OR ---------------------------------------
10486 
10487 instruct vor4B(vecS dst, vecS src) %{
10488   predicate(n->as_Vector()->length_in_bytes() == 4);
10489   match(Set dst (OrV dst src));
10490   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
10491   ins_encode %{
10492     __ por($dst$$XMMRegister, $src$$XMMRegister);
10493   %}
10494   ins_pipe( pipe_slow );
10495 %}
10496 
10497 instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
10498   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10499   match(Set dst (OrV src1 src2));
10500   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
10501   ins_encode %{
10502     int vector_len = 0;
10503     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10504   %}
10505   ins_pipe( pipe_slow );
10506 %}
10507 
10508 instruct vor4B_mem(vecS dst, vecS src, memory mem) %{
10509   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10510   match(Set dst (OrV src (LoadVector mem)));
10511   format %{ "vpor    $dst,$src,$mem\t! or vectors (4 bytes)" %}
10512   ins_encode %{
10513     int vector_len = 0;
10514     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10515   %}
10516   ins_pipe( pipe_slow );
10517 %}
10518 
10519 instruct vor8B(vecD dst, vecD src) %{
10520   predicate(n->as_Vector()->length_in_bytes() == 8);
10521   match(Set dst (OrV dst src));
10522   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
10523   ins_encode %{
10524     __ por($dst$$XMMRegister, $src$$XMMRegister);
10525   %}
10526   ins_pipe( pipe_slow );
10527 %}
10528 
10529 instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
10530   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10531   match(Set dst (OrV src1 src2));
10532   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
10533   ins_encode %{
10534     int vector_len = 0;
10535     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10536   %}
10537   ins_pipe( pipe_slow );
10538 %}
10539 
10540 instruct vor8B_mem(vecD dst, vecD src, memory mem) %{
10541   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10542   match(Set dst (OrV src (LoadVector mem)));
10543   format %{ "vpor    $dst,$src,$mem\t! or vectors (8 bytes)" %}
10544   ins_encode %{
10545     int vector_len = 0;
10546     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10547   %}
10548   ins_pipe( pipe_slow );
10549 %}
10550 
10551 instruct vor16B(vecX dst, vecX src) %{
10552   predicate(n->as_Vector()->length_in_bytes() == 16);
10553   match(Set dst (OrV dst src));
10554   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
10555   ins_encode %{
10556     __ por($dst$$XMMRegister, $src$$XMMRegister);
10557   %}
10558   ins_pipe( pipe_slow );
10559 %}
10560 
10561 instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
10562   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10563   match(Set dst (OrV src1 src2));
10564   format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
10565   ins_encode %{
10566     int vector_len = 0;
10567     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10568   %}
10569   ins_pipe( pipe_slow );
10570 %}
10571 
10572 instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
10573   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10574   match(Set dst (OrV src (LoadVector mem)));
10575   format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
10576   ins_encode %{
10577     int vector_len = 0;
10578     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10579   %}
10580   ins_pipe( pipe_slow );
10581 %}
10582 
10583 instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
10584   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10585   match(Set dst (OrV src1 src2));
10586   format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
10587   ins_encode %{
10588     int vector_len = 1;
10589     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10590   %}
10591   ins_pipe( pipe_slow );
10592 %}
10593 
10594 instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
10595   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10596   match(Set dst (OrV src (LoadVector mem)));
10597   format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
10598   ins_encode %{
10599     int vector_len = 1;
10600     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10601   %}
10602   ins_pipe( pipe_slow );
10603 %}
10604 
10605 instruct vor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
10606   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10607   match(Set dst (OrV src1 src2));
10608   format %{ "vpor    $dst,$src1,$src2\t! or vectors (64 bytes)" %}
10609   ins_encode %{
10610     int vector_len = 2;
10611     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10612   %}
10613   ins_pipe( pipe_slow );
10614 %}
10615 
10616 instruct vor64B_mem(vecZ dst, vecZ src, memory mem) %{
10617   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10618   match(Set dst (OrV src (LoadVector mem)));
10619   format %{ "vpor    $dst,$src,$mem\t! or vectors (64 bytes)" %}
10620   ins_encode %{
10621     int vector_len = 2;
10622     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10623   %}
10624   ins_pipe( pipe_slow );
10625 %}
10626 
10627 // --------------------------------- XOR --------------------------------------
10628 
10629 instruct vxor4B(vecS dst, vecS src) %{
10630   predicate(n->as_Vector()->length_in_bytes() == 4);
10631   match(Set dst (XorV dst src));
10632   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
10633   ins_encode %{
10634     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
10635   %}
10636   ins_pipe( pipe_slow );
10637 %}
10638 
10639 instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
10640   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10641   match(Set dst (XorV src1 src2));
10642   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
10643   ins_encode %{
10644     int vector_len = 0;
10645     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10646   %}
10647   ins_pipe( pipe_slow );
10648 %}
10649 
10650 instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{
10651   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
10652   match(Set dst (XorV src (LoadVector mem)));
10653   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (4 bytes)" %}
10654   ins_encode %{
10655     int vector_len = 0;
10656     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10657   %}
10658   ins_pipe( pipe_slow );
10659 %}
10660 
10661 instruct vxor8B(vecD dst, vecD src) %{
10662   predicate(n->as_Vector()->length_in_bytes() == 8);
10663   match(Set dst (XorV dst src));
10664   format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
10665   ins_encode %{
10666     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
10667   %}
10668   ins_pipe( pipe_slow );
10669 %}
10670 
10671 instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
10672   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10673   match(Set dst (XorV src1 src2));
10674   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
10675   ins_encode %{
10676     int vector_len = 0;
10677     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10678   %}
10679   ins_pipe( pipe_slow );
10680 %}
10681 
10682 instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
10683   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
10684   match(Set dst (XorV src (LoadVector mem)));
10685   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (8 bytes)" %}
10686   ins_encode %{
10687     int vector_len = 0;
10688     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10689   %}
10690   ins_pipe( pipe_slow );
10691 %}
10692 
10693 instruct vxor16B(vecX dst, vecX src) %{
10694   predicate(n->as_Vector()->length_in_bytes() == 16);
10695   match(Set dst (XorV dst src));
10696   format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
10697   ins_encode %{
10698     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
10699   %}
10700   ins_pipe( pipe_slow );
10701 %}
10702 
10703 instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
10704   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10705   match(Set dst (XorV src1 src2));
10706   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
10707   ins_encode %{
10708     int vector_len = 0;
10709     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10710   %}
10711   ins_pipe( pipe_slow );
10712 %}
10713 
10714 instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
10715   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
10716   match(Set dst (XorV src (LoadVector mem)));
10717   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
10718   ins_encode %{
10719     int vector_len = 0;
10720     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10721   %}
10722   ins_pipe( pipe_slow );
10723 %}
10724 
10725 instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
10726   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10727   match(Set dst (XorV src1 src2));
10728   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
10729   ins_encode %{
10730     int vector_len = 1;
10731     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10732   %}
10733   ins_pipe( pipe_slow );
10734 %}
10735 
10736 instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
10737   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
10738   match(Set dst (XorV src (LoadVector mem)));
10739   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
10740   ins_encode %{
10741     int vector_len = 1;
10742     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10743   %}
10744   ins_pipe( pipe_slow );
10745 %}
10746 
10747 instruct vxor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
10748   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10749   match(Set dst (XorV src1 src2));
10750   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (64 bytes)" %}
10751   ins_encode %{
10752     int vector_len = 2;
10753     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
10754   %}
10755   ins_pipe( pipe_slow );
10756 %}
10757 
10758 instruct vxor64B_mem(vecZ dst, vecZ src, memory mem) %{
10759   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
10760   match(Set dst (XorV src (LoadVector mem)));
10761   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (64 bytes)" %}
10762   ins_encode %{
10763     int vector_len = 2;
10764     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
10765   %}
10766   ins_pipe( pipe_slow );
10767 %}
10768 
10769 // --------------------------------- FMA --------------------------------------
10770 
10771 // a * b + c
10772 instruct vfma2D_reg(vecX a, vecX b, vecX c) %{
10773   predicate(UseFMA && n->as_Vector()->length() == 2);
10774   match(Set c (FmaVD  c (Binary a b)));
10775   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
10776   ins_cost(150);
10777   ins_encode %{
10778     int vector_len = 0;
10779     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
10780   %}
10781   ins_pipe( pipe_slow );
10782 %}
10783 
10784 // a * b + c
10785 instruct vfma2D_mem(vecX a, memory b, vecX c) %{
10786   predicate(UseFMA && n->as_Vector()->length() == 2);
10787   match(Set c (FmaVD  c (Binary a (LoadVector b))));
10788   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
10789   ins_cost(150);
10790   ins_encode %{
10791     int vector_len = 0;
10792     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
10793   %}
10794   ins_pipe( pipe_slow );
10795 %}
10796 
10797 
10798 // a * b + c
10799 instruct vfma4D_reg(vecY a, vecY b, vecY c) %{
10800   predicate(UseFMA && n->as_Vector()->length() == 4);
10801   match(Set c (FmaVD  c (Binary a b)));
10802   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
10803   ins_cost(150);
10804   ins_encode %{
10805     int vector_len = 1;
10806     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
10807   %}
10808   ins_pipe( pipe_slow );
10809 %}
10810 
10811 // a * b + c
10812 instruct vfma4D_mem(vecY a, memory b, vecY c) %{
10813   predicate(UseFMA && n->as_Vector()->length() == 4);
10814   match(Set c (FmaVD  c (Binary a (LoadVector b))));
10815   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
10816   ins_cost(150);
10817   ins_encode %{
10818     int vector_len = 1;
10819     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
10820   %}
10821   ins_pipe( pipe_slow );
10822 %}
10823 
10824 // a * b + c
10825 instruct vfma8D_reg(vecZ a, vecZ b, vecZ c) %{
10826   predicate(UseFMA && n->as_Vector()->length() == 8);
10827   match(Set c (FmaVD  c (Binary a b)));
10828   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
10829   ins_cost(150);
10830   ins_encode %{
10831     int vector_len = 2;
10832     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
10833   %}
10834   ins_pipe( pipe_slow );
10835 %}
10836 
10837 // a * b + c
10838 instruct vfma8D_mem(vecZ a, memory b, vecZ c) %{
10839   predicate(UseFMA && n->as_Vector()->length() == 8);
10840   match(Set c (FmaVD  c (Binary a (LoadVector b))));
10841   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
10842   ins_cost(150);
10843   ins_encode %{
10844     int vector_len = 2;
10845     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
10846   %}
10847   ins_pipe( pipe_slow );
10848 %}
10849 
10850 // a * b + c
10851 instruct vfma4F_reg(vecX a, vecX b, vecX c) %{
10852   predicate(UseFMA && n->as_Vector()->length() == 4);
10853   match(Set c (FmaVF  c (Binary a b)));
10854   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
10855   ins_cost(150);
10856   ins_encode %{
10857     int vector_len = 0;
10858     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
10859   %}
10860   ins_pipe( pipe_slow );
10861 %}
10862 
10863 // a * b + c
10864 instruct vfma4F_mem(vecX a, memory b, vecX c) %{
10865   predicate(UseFMA && n->as_Vector()->length() == 4);
10866   match(Set c (FmaVF  c (Binary a (LoadVector b))));
10867   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
10868   ins_cost(150);
10869   ins_encode %{
10870     int vector_len = 0;
10871     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
10872   %}
10873   ins_pipe( pipe_slow );
10874 %}
10875 
10876 // a * b + c
10877 instruct vfma8F_reg(vecY a, vecY b, vecY c) %{
10878   predicate(UseFMA && n->as_Vector()->length() == 8);
10879   match(Set c (FmaVF  c (Binary a b)));
10880   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
10881   ins_cost(150);
10882   ins_encode %{
10883     int vector_len = 1;
10884     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
10885   %}
10886   ins_pipe( pipe_slow );
10887 %}
10888 
10889 // a * b + c
10890 instruct vfma8F_mem(vecY a, memory b, vecY c) %{
10891   predicate(UseFMA && n->as_Vector()->length() == 8);
10892   match(Set c (FmaVF  c (Binary a (LoadVector b))));
10893   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
10894   ins_cost(150);
10895   ins_encode %{
10896     int vector_len = 1;
10897     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
10898   %}
10899   ins_pipe( pipe_slow );
10900 %}
10901 
10902 // a * b + c
10903 instruct vfma16F_reg(vecZ a, vecZ b, vecZ c) %{
10904   predicate(UseFMA && n->as_Vector()->length() == 16);
10905   match(Set c (FmaVF  c (Binary a b)));
10906   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
10907   ins_cost(150);
10908   ins_encode %{
10909     int vector_len = 2;
10910     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
10911   %}
10912   ins_pipe( pipe_slow );
10913 %}
10914 
10915 // a * b + c
10916 instruct vfma16F_mem(vecZ a, memory b, vecZ c) %{
10917   predicate(UseFMA && n->as_Vector()->length() == 16);
10918   match(Set c (FmaVF  c (Binary a (LoadVector b))));
10919   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
10920   ins_cost(150);
10921   ins_encode %{
10922     int vector_len = 2;
10923     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
10924   %}
10925   ins_pipe( pipe_slow );
10926 %}
10927 
10928 // --------------------------------- PopCount --------------------------------------
10929 
10930 instruct vpopcount2I(vecD dst, vecD src) %{
10931   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 2);
10932   match(Set dst (PopCountVI src));
10933   format %{ "vpopcntd  $dst,$src\t! vector popcount packed2I" %}
10934   ins_encode %{
10935     int vector_len = 0;
10936     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
10937   %}
10938   ins_pipe( pipe_slow );
10939 %}
10940 
10941 instruct vpopcount4I(vecX dst, vecX src) %{
10942   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 4);
10943   match(Set dst (PopCountVI src));
10944   format %{ "vpopcntd  $dst,$src\t! vector popcount packed4I" %}
10945   ins_encode %{
10946     int vector_len = 0;
10947     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
10948   %}
10949   ins_pipe( pipe_slow );
10950 %}
10951 
10952 instruct vpopcount8I(vecY dst, vecY src) %{
10953   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 8);
10954   match(Set dst (PopCountVI src));
10955   format %{ "vpopcntd  $dst,$src\t! vector popcount packed8I" %}
10956   ins_encode %{
10957     int vector_len = 1;
10958     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
10959   %}
10960   ins_pipe( pipe_slow );
10961 %}
10962 
10963 instruct vpopcount16I(vecZ dst, vecZ src) %{
10964   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 16);
10965   match(Set dst (PopCountVI src));
10966   format %{ "vpopcntd  $dst,$src\t! vector popcount packed16I" %}
10967   ins_encode %{
10968     int vector_len = 2;
10969     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
10970   %}
10971   ins_pipe( pipe_slow );
10972 %}