1 //
   2 // Copyright (c) 2011, 2012, Oracle and/or its affiliates. All rights reserved.
   3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4 //
   5 // This code is free software; you can redistribute it and/or modify it
   6 // under the terms of the GNU General Public License version 2 only, as
   7 // published by the Free Software Foundation.
   8 //
   9 // This code is distributed in the hope that it will be useful, but WITHOUT
  10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 // version 2 for more details (a copy is included in the LICENSE file that
  13 // accompanied this code).
  14 //
  15 // You should have received a copy of the GNU General Public License version
  16 // 2 along with this work; if not, write to the Free Software Foundation,
  17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20 // or visit www.oracle.com if you need additional information or have any
  21 // questions.
  22 //
  23 //
  24 
  25 // X86 Common Architecture Description File
  26 
  27 //----------REGISTER DEFINITION BLOCK------------------------------------------
  28 // This information is used by the matcher and the register allocator to
  29 // describe individual registers and classes of registers within the target
  30 // architecture.
  31 
  32 register %{
  33 //----------Architecture Description Register Definitions----------------------
  34 // General Registers
  35 // "reg_def"  name ( register save type, C convention save type,
  36 //                   ideal register type, encoding );
  37 // Register Save Types:
  38 //
  39 // NS  = No-Save:       The register allocator assumes that these registers
  40 //                      can be used without saving upon entry to the method, &
  41 //                      that they do not need to be saved at call sites.
  42 //
  43 // SOC = Save-On-Call:  The register allocator assumes that these registers
  44 //                      can be used without saving upon entry to the method,
  45 //                      but that they must be saved at call sites.
  46 //
  47 // SOE = Save-On-Entry: The register allocator assumes that these registers
  48 //                      must be saved before using them upon entry to the
  49 //                      method, but they do not need to be saved at call
  50 //                      sites.
  51 //
  52 // AS  = Always-Save:   The register allocator assumes that these registers
  53 //                      must be saved before using them upon entry to the
  54 //                      method, & that they must be saved at call sites.
  55 //
  56 // Ideal Register Type is used to determine how to save & restore a
  57 // register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
  58 // spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
  59 //
  60 // The encoding number is the actual bit-pattern placed into the opcodes.
  61 
  62 // XMM registers.  256-bit registers, i.e. 8 32-bit words each, labeled (a)-(h).
  63 // Word a in each register holds a Float; words a-b together hold a Double.
  64 // The full registers are used by SSE4.2 intrinsics,
  65 // array copy stubs and superword operations (see the UseSSE42Intrinsics,
  66 // UseXMMForArrayCopy and UseSuperWord flags).
  67 // XMM8-XMM15 must be encoded with REX (VEX for UseAVX).
  68 // Linux ABI:   No register preserved across function calls
  69 //              XMM0-XMM7 might hold parameters
  70 // Windows ABI: XMM6-XMM15 preserved across function calls
  71 //              XMM0-XMM3 might hold parameters
  72 
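// As an illustration of the reg_def legend above, the first definition below,
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
// declares XMM0 as Save-On-Call for both the allocator and the C calling
// convention, spilled/reloaded as a float (Op_RegF), with hardware encoding 0,
// and backed by the VMReg returned by xmm0->as_VMReg().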
  73 reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
  74 reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
  75 reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
  76 reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
  77 reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
  78 reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
  79 reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
  80 reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
  81 
  82 reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
  83 reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
  84 reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
  85 reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
  86 reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
  87 reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
  88 reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
  89 reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
  90 
  91 reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
  92 reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
  93 reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
  94 reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
  95 reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
  96 reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
  97 reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
  98 reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
  99 
 100 reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
 101 reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
 102 reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
 103 reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
 104 reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
 105 reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
 106 reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
 107 reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
 108 
 109 reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
 110 reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
 111 reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
 112 reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
 113 reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
 114 reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
 115 reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
 116 reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
 117 
 118 reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
 119 reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
 120 reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
 121 reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
 122 reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
 123 reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
 124 reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
 125 reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
 126 
 127 #ifdef _WIN64
 128 
 129 reg_def XMM6 ( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg());
 130 reg_def XMM6b( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 131 reg_def XMM6c( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 132 reg_def XMM6d( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 133 reg_def XMM6e( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 134 reg_def XMM6f( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 135 reg_def XMM6g( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 136 reg_def XMM6h( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 137 
 138 reg_def XMM7 ( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg());
 139 reg_def XMM7b( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 140 reg_def XMM7c( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 141 reg_def XMM7d( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 142 reg_def XMM7e( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 143 reg_def XMM7f( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 144 reg_def XMM7g( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 145 reg_def XMM7h( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 146 
 147 reg_def XMM8 ( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg());
 148 reg_def XMM8b( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 149 reg_def XMM8c( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 150 reg_def XMM8d( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 151 reg_def XMM8e( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 152 reg_def XMM8f( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 153 reg_def XMM8g( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 154 reg_def XMM8h( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 155 
 156 reg_def XMM9 ( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg());
 157 reg_def XMM9b( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 158 reg_def XMM9c( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 159 reg_def XMM9d( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 160 reg_def XMM9e( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 161 reg_def XMM9f( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 162 reg_def XMM9g( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 163 reg_def XMM9h( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 164 
 165 reg_def XMM10 ( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg());
 166 reg_def XMM10b( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 167 reg_def XMM10c( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 168 reg_def XMM10d( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 169 reg_def XMM10e( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 170 reg_def XMM10f( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 171 reg_def XMM10g( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 172 reg_def XMM10h( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 173 
 174 reg_def XMM11 ( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg());
 175 reg_def XMM11b( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 176 reg_def XMM11c( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 177 reg_def XMM11d( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 178 reg_def XMM11e( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 179 reg_def XMM11f( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 180 reg_def XMM11g( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 181 reg_def XMM11h( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 182 
 183 reg_def XMM12 ( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg());
 184 reg_def XMM12b( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 185 reg_def XMM12c( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 186 reg_def XMM12d( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 187 reg_def XMM12e( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 188 reg_def XMM12f( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 189 reg_def XMM12g( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 190 reg_def XMM12h( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 191 
 192 reg_def XMM13 ( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg());
 193 reg_def XMM13b( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 194 reg_def XMM13c( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 195 reg_def XMM13d( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 196 reg_def XMM13e( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 197 reg_def XMM13f( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 198 reg_def XMM13g( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 199 reg_def XMM13h( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 200 
 201 reg_def XMM14 ( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg());
 202 reg_def XMM14b( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 203 reg_def XMM14c( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 204 reg_def XMM14d( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 205 reg_def XMM14e( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 206 reg_def XMM14f( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 207 reg_def XMM14g( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 208 reg_def XMM14h( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 209 
 210 reg_def XMM15 ( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg());
 211 reg_def XMM15b( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 212 reg_def XMM15c( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 213 reg_def XMM15d( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 214 reg_def XMM15e( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 215 reg_def XMM15f( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 216 reg_def XMM15g( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 217 reg_def XMM15h( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 218 
 219 #else // _WIN64
 220 
 221 reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
 222 reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 223 reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 224 reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 225 reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 226 reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 227 reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 228 reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 229 
 230 reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
 231 reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 232 reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 233 reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 234 reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 235 reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 236 reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 237 reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 238 
 239 #ifdef _LP64
 240 
 241 reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
 242 reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 243 reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 244 reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 245 reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 246 reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 247 reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 248 reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 249 
 250 reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
 251 reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 252 reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 253 reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 254 reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 255 reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 256 reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 257 reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 258 
 259 reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
 260 reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 261 reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 262 reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 263 reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 264 reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 265 reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 266 reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 267 
 268 reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
 269 reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 270 reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 271 reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 272 reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 273 reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 274 reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 275 reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 276 
 277 reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
 278 reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 279 reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 280 reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 281 reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 282 reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 283 reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 284 reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 285 
 286 reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
 287 reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 288 reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 289 reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 290 reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 291 reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 292 reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 293 reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 294 
 295 reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
 296 reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 297 reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 298 reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 299 reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 300 reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 301 reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 302 reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 303 
 304 reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
 305 reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 306 reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 307 reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 308 reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 309 reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 310 reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 311 reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 312 
 313 #endif // _LP64
 314 
 315 #endif // _WIN64
 316 
 317 #ifdef _LP64
 318 reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
 319 #else
 320 reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
 321 #endif // _LP64
 322 
 323 alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 324                    XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 325                    XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 326                    XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 327                    XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 328                    XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 329                    XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 330                    XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 331 #ifdef _LP64
 332                   ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 333                    XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 334                    XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 335                    XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 336                    XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 337                    XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 338                    XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 339                    XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 340 #endif
 341                    );
 342 
 343 // flags allocation class should be last.
 344 alloc_class chunk2(RFLAGS);
 345 
 346 // Singleton class for condition codes
 347 reg_class int_flags(RFLAGS);
 348 
 349 // Class for all float registers
 350 reg_class float_reg(XMM0,
 351                     XMM1,
 352                     XMM2,
 353                     XMM3,
 354                     XMM4,
 355                     XMM5,
 356                     XMM6,
 357                     XMM7
 358 #ifdef _LP64
 359                    ,XMM8,
 360                     XMM9,
 361                     XMM10,
 362                     XMM11,
 363                     XMM12,
 364                     XMM13,
 365                     XMM14,
 366                     XMM15
 367 #endif
 368                     );
 369 
 370 // Class for all double registers
 371 reg_class double_reg(XMM0,  XMM0b,
 372                      XMM1,  XMM1b,
 373                      XMM2,  XMM2b,
 374                      XMM3,  XMM3b,
 375                      XMM4,  XMM4b,
 376                      XMM5,  XMM5b,
 377                      XMM6,  XMM6b,
 378                      XMM7,  XMM7b
 379 #ifdef _LP64
 380                     ,XMM8,  XMM8b,
 381                      XMM9,  XMM9b,
 382                      XMM10, XMM10b,
 383                      XMM11, XMM11b,
 384                      XMM12, XMM12b,
 385                      XMM13, XMM13b,
 386                      XMM14, XMM14b,
 387                      XMM15, XMM15b
 388 #endif
 389                      );
 390 
 391 // Class for all 32bit vector registers
 392 reg_class vectors_reg(XMM0,
 393                       XMM1,
 394                       XMM2,
 395                       XMM3,
 396                       XMM4,
 397                       XMM5,
 398                       XMM6,
 399                       XMM7
 400 #ifdef _LP64
 401                      ,XMM8,
 402                       XMM9,
 403                       XMM10,
 404                       XMM11,
 405                       XMM12,
 406                       XMM13,
 407                       XMM14,
 408                       XMM15
 409 #endif
 410                       );
 411 
 412 // Class for all 64bit vector registers
 413 reg_class vectord_reg(XMM0,  XMM0b,
 414                       XMM1,  XMM1b,
 415                       XMM2,  XMM2b,
 416                       XMM3,  XMM3b,
 417                       XMM4,  XMM4b,
 418                       XMM5,  XMM5b,
 419                       XMM6,  XMM6b,
 420                       XMM7,  XMM7b
 421 #ifdef _LP64
 422                      ,XMM8,  XMM8b,
 423                       XMM9,  XMM9b,
 424                       XMM10, XMM10b,
 425                       XMM11, XMM11b,
 426                       XMM12, XMM12b,
 427                       XMM13, XMM13b,
 428                       XMM14, XMM14b,
 429                       XMM15, XMM15b
 430 #endif
 431                       );
 432 
 433 // Class for all 128bit vector registers
 434 reg_class vectorx_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,
 435                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 436                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 437                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 438                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 439                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 440                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 441                       XMM7,  XMM7b,  XMM7c,  XMM7d
 442 #ifdef _LP64
 443                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 444                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 445                       XMM10, XMM10b, XMM10c, XMM10d,
 446                       XMM11, XMM11b, XMM11c, XMM11d,
 447                       XMM12, XMM12b, XMM12c, XMM12d,
 448                       XMM13, XMM13b, XMM13c, XMM13d,
 449                       XMM14, XMM14b, XMM14c, XMM14d,
 450                       XMM15, XMM15b, XMM15c, XMM15d
 451 #endif
 452                       );
 453 
 454 // Class for all 256bit vector registers
 455 reg_class vectory_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 456                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 457                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 458                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 459                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 460                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 461                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 462                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 463 #ifdef _LP64
 464                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 465                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 466                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 467                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 468                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 469                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 470                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 471                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 472 #endif
 473                       );
 474 
 475 %}
 476 
 477 
 478 //----------SOURCE BLOCK-------------------------------------------------------
 479 // This is a block of C++ code which provides values, functions, and
 480 // definitions necessary in the rest of the architecture description
 481 
 482 source_hpp %{
 483 // Header information of the source block.
 484 // Method declarations/definitions which are used outside
 485 // the ad-scope can conveniently be defined here.
 486 //
 487 // To keep related declarations/definitions/uses close together,
 488 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
 489 
 490 class CallStubImpl {
 491  
 492   //--------------------------------------------------------------
 493   //---<  Used for optimization in Compile::shorten_branches  >---
 494   //--------------------------------------------------------------
 495 
 496  public:
 497   // Size of call trampoline stub.
 498   static uint size_call_trampoline() {
 499     return 0; // no call trampolines on this platform
 500   }
 501   
 502   // number of relocations needed by a call trampoline stub
 503   static uint reloc_call_trampoline() { 
 504     return 0; // no call trampolines on this platform
 505   }
 506 };
 507 
 508 class HandlerImpl {
 509 
 510  public:
 511 
 512   static int emit_exception_handler(CodeBuffer &cbuf);
 513   static int emit_deopt_handler(CodeBuffer& cbuf);
 514 
 515   static uint size_exception_handler() {
 516     // NativeCall instruction size is the same as NativeJump.
 517     // The exception handler starts out as a jump and can be patched to
 518     // a call by deoptimization.  (4932387)
 519     // Note that this value is also credited (in output.cpp) to
 520     // the size of the code section.
 521     return NativeJump::instruction_size;
 522   }
 523 
 524 #ifdef _LP64
 525   static uint size_deopt_handler() {
 526     // three 5 byte instructions
 527     return 15;
 528   }
 529 #else
 530   static uint size_deopt_handler() {
 531     // NativeCall instruction size is the same as NativeJump.
 532     // The exception handler starts out as a jump and can be patched to
 533     // a call by deoptimization.  (4932387)
 534     // Note that this value is also credited (in output.cpp) to
 535     // the size of the code section.
 536     return 5 + NativeJump::instruction_size; // pushl(); jmp;
 537   }
 538 #endif
 539 };
 540 
 541 %} // end source_hpp
 542 
 543 source %{
 544 
 545 // Emit exception handler code.
 546 // The handler simply jumps to the runtime's exception blob.
 547 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
 548 
 549   // Note that the code buffer's insts_mark is always relative to insts.
 550   // That's why we must use the macroassembler to generate a handler.
 551   MacroAssembler _masm(&cbuf);
 552   address base = __ start_a_stub(size_exception_handler());
 553   if (base == NULL)  return 0;  // CodeBuffer::expand failed
 554   int offset = __ offset();
 555   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 556   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 557   __ end_a_stub();
 558   return offset;
 559 }
 560 
 561 // Emit deopt handler code.
 562 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
 563 
 564   // Note that the code buffer's insts_mark is always relative to insts.
 565   // That's why we must use the macroassembler to generate a handler.
 566   MacroAssembler _masm(&cbuf);
 567   address base = __ start_a_stub(size_deopt_handler());
 568   if (base == NULL)  return 0;  // CodeBuffer::expand failed
 569   int offset = __ offset();
 570 
 571 #ifdef _LP64
 572   address the_pc = (address) __ pc();
 573   Label next;
 574   // Push the address "the_pc" on the stack without destroying any registers,
 575   // as they may all be live.
 576 
 577   // push address of "next"
 578   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 579   __ bind(next);
 580   // adjust it so it matches "the_pc"
 581   __ subptr(Address(rsp, 0), __ offset() - offset);
 582 #else
 583   InternalAddress here(__ pc());
 584   __ pushptr(here.addr());
 585 #endif
 586 
 587   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 588   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow");
 589   __ end_a_stub();
 590   return offset;
 591 }
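// Shape of the 64-bit handler emitted above (a sketch; it assumes the deopt
// blob is reachable with a 32-bit displacement, which the size assert relies on):
//   call   next                 ; 5 bytes, pushes the address of 'next'
// next:
//   subq   [rsp], <call size>   ; 5 bytes, rewinds the pushed address to "the_pc"
//   jmp    <deopt blob unpack>  ; 5 bytes (rel32)
// for a total of 15 bytes, matching size_deopt_handler()'s "three 5 byte
// instructions" in the header block above.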
 592 
 593 
 594 //=============================================================================
 595 
 596   // Float masks come from different places depending on platform.
 597 #ifdef _LP64
 598   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 599   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 600   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 601   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 602 #else
 603   static address float_signmask()  { return (address)float_signmask_pool; }
 604   static address float_signflip()  { return (address)float_signflip_pool; }
 605   static address double_signmask() { return (address)double_signmask_pool; }
 606   static address double_signflip() { return (address)double_signflip_pool; }
 607 #endif
 608 
 609 
 610 const bool Matcher::match_rule_supported(int opcode) {
 611   if (!has_match_rule(opcode))
 612     return false;
 613 
 614   switch (opcode) {
 615     case Op_PopCountI:
 616     case Op_PopCountL:
 617       if (!UsePopCountInstruction)
 618         return false;
 619     break;
 620     case Op_MulVI:
 621       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
 622         return false;
 623     break;
 624     case Op_CompareAndSwapL:
 625 #ifdef _LP64
 626     case Op_CompareAndSwapP:
 627 #endif
 628       if (!VM_Version::supports_cx8())
 629         return false;
 630     break;
 631   }
 632 
 633   return true;  // By default, match rules are supported.
 634 }
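// Example of the effect: with -XX:-UsePopCountInstruction the Op_PopCountI/L
// cases above report the rule as unsupported, so callers that consult
// match_rule_supported() fall back to the non-intrinsic code path instead of
// relying on a popcnt-based match (an illustrative reading, not an exhaustive
// list of callers).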
 635 
 636 // Max vector size in bytes. 0 if not supported.
 637 const int Matcher::vector_width_in_bytes(BasicType bt) {
 638   assert(is_java_primitive(bt), "only primitive type vectors");
 639   if (UseSSE < 2) return 0;
 640   // SSE2 supports 128bit vectors for all types.
 641   // AVX2 supports 256bit vectors for all types.
 642   int size = (UseAVX > 1) ? 32 : 16;
 643   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 644   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 645     size = 32;
 646   // Use flag to limit vector size.
 647   size = MIN2(size,(int)MaxVectorSize);
 648   // Minimum 2 values in vector (or 4 for bytes).
 649   switch (bt) {
 650   case T_DOUBLE:
 651   case T_LONG:
 652     if (size < 16) return 0;
 653   case T_FLOAT:
 654   case T_INT:
 655     if (size < 8) return 0;
 656   case T_BOOLEAN:
 657   case T_BYTE:
 658   case T_CHAR:
 659   case T_SHORT:
 660     if (size < 4) return 0;
 661     break;
 662   default:
 663     ShouldNotReachHere();
 664   }
 665   return size;
 666 }
 667 
 668 // Limits on vector size (number of elements) loaded into vector.
 669 const int Matcher::max_vector_size(const BasicType bt) {
 670   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 671 }
 672 const int Matcher::min_vector_size(const BasicType bt) {
 673   int max_size = max_vector_size(bt);
 674   // Minimum size which can be loaded into a vector is 4 bytes.
 675   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
 676   return MIN2(size,max_size);
 677 }
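// Worked example (illustrative only): with UseSSE >= 2, UseAVX == 0 and
// MaxVectorSize = 16, vector_width_in_bytes(T_INT) is 16 -- the T_LONG and
// T_FLOAT/T_INT size checks above fall through intentionally -- so
// max_vector_size(T_INT) is 16/4 = 4 elements and min_vector_size(T_INT) is 2,
// while min_vector_size(T_BYTE) is 4 because single-byte elements need at
// least 4 bytes in a vector.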
 678 
 679 // Vector ideal reg corresponding to specified size in bytes
 680 const int Matcher::vector_ideal_reg(int size) {
 681   assert(MaxVectorSize >= size, "");
 682   switch(size) {
 683     case  4: return Op_VecS;
 684     case  8: return Op_VecD;
 685     case 16: return Op_VecX;
 686     case 32: return Op_VecY;
 687   }
 688   ShouldNotReachHere();
 689   return 0;
 690 }
 691 
 692 // Only lowest bits of xmm reg are used for vector shift count.
 693 const int Matcher::vector_shift_count_ideal_reg(int size) {
 694   return Op_VecS;
 695 }
 696 
 697 // x86 supports misaligned vector loads and stores.
 698 const bool Matcher::misaligned_vectors_ok() {
 699   return !AlignVector; // can be changed by flag
 700 }
 701 
 702 // x86 AES instructions are compatible with SunJCE expanded
 703 // keys, hence we do not need to pass the original key to stubs
 704 const bool Matcher::pass_original_key_for_aes() {
 705   return false;
 706 }
 707 
 708 // Helper methods for MachSpillCopyNode::implementation().
 709 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
 710                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
 711   // In the 64-bit VM size calculation is very complex, so the size is
 712   // obtained by emitting the instructions into a scratch buffer.
 713   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
 714   assert(ireg == Op_VecS || // 32bit vector
 715          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
 716          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
 717          "no non-adjacent vector moves" );
 718   if (cbuf) {
 719     MacroAssembler _masm(cbuf);
 720     int offset = __ offset();
 721     switch (ireg) {
 722     case Op_VecS: // copy whole register
 723     case Op_VecD:
 724     case Op_VecX:
 725       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 726       break;
 727     case Op_VecY:
 728       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 729       break;
 730     default:
 731       ShouldNotReachHere();
 732     }
 733     int size = __ offset() - offset;
 734 #ifdef ASSERT
 735     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
 736     assert(!do_size || size == 4, "incorrect size calculation");
 737 #endif
 738     return size;
 739 #ifndef PRODUCT
 740   } else if (!do_size) {
 741     switch (ireg) {
 742     case Op_VecS:
 743     case Op_VecD:
 744     case Op_VecX:
 745       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 746       break;
 747     case Op_VecY:
 748       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 749       break;
 750     default:
 751       ShouldNotReachHere();
 752     }
 753 #endif
 754   }
 755   // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
 756   return 4;
 757 }
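// Size sketch for the 32-bit (do_size) path above, i.e. why 4 is returned when
// no CodeBuffer is supplied: movdqu xmm,xmm encodes as F3 + 0F 6F + ModRM
// (4 bytes), and vmovdqu with a 2-byte VEX prefix encodes as C5 xx + 6F + ModRM
// (also 4 bytes), which is exactly what the ASSERT block checks.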
 758 
 759 static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
 760                             int stack_offset, int reg, uint ireg, outputStream* st) {
 761   // In the 64-bit VM size calculation is very complex, so the size is
 762   // obtained by emitting the instructions into a scratch buffer.
 763   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
 764   if (cbuf) {
 765     MacroAssembler _masm(cbuf);
 766     int offset = __ offset();
 767     if (is_load) {
 768       switch (ireg) {
 769       case Op_VecS:
 770         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 771         break;
 772       case Op_VecD:
 773         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 774         break;
 775       case Op_VecX:
 776         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 777         break;
 778       case Op_VecY:
 779         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 780         break;
 781       default:
 782         ShouldNotReachHere();
 783       }
 784     } else { // store
 785       switch (ireg) {
 786       case Op_VecS:
 787         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 788         break;
 789       case Op_VecD:
 790         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 791         break;
 792       case Op_VecX:
 793         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 794         break;
 795       case Op_VecY:
 796         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 797         break;
 798       default:
 799         ShouldNotReachHere();
 800       }
 801     }
 802     int size = __ offset() - offset;
 803 #ifdef ASSERT
 804     int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4);
 805     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
 806     assert(!do_size || size == (5+offset_size), "incorrect size calculation");
 807 #endif
 808     return size;
 809 #ifndef PRODUCT
 810   } else if (!do_size) {
 811     if (is_load) {
 812       switch (ireg) {
 813       case Op_VecS:
 814         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 815         break;
 816       case Op_VecD:
 817         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 818         break;
 819        case Op_VecX:
 820         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 821         break;
 822       case Op_VecY:
 823         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 824         break;
 825       default:
 826         ShouldNotReachHere();
 827       }
 828     } else { // store
 829       switch (ireg) {
 830       case Op_VecS:
 831         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 832         break;
 833       case Op_VecD:
 834         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 835         break;
 836        case Op_VecX:
 837         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 838         break;
 839       case Op_VecY:
 840         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 841         break;
 842       default:
 843         ShouldNotReachHere();
 844       }
 845     }
 846 #endif
 847   }
 848   int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4);
 849   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
 850   return 5+offset_size;
 851 }
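// Size sketch for the 32-bit (do_size) path above, i.e. the 5 + offset_size
// fallback: e.g. movdqu xmm0, [esp + 0x40] is F3 + 0F 6F + ModRM + SIB
// (5 bytes) plus a 1-byte displacement, giving 6 bytes and offset_size == 1
// for stack offsets below 0x80; a zero offset needs no displacement byte.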
 852 
 853 static inline jfloat replicate4_imm(int con, int width) {
 854   // Load a constant of "width" (in bytes) and replicate it to fill 32 bits.
 855   assert(width == 1 || width == 2, "only byte or short types here");
 856   int bit_width = width * 8;
 857   jint val = con;
 858   val &= (1 << bit_width) - 1;  // mask off sign bits
 859   while(bit_width < 32) {
 860     val |= (val << bit_width);
 861     bit_width <<= 1;
 862   }
 863   jfloat fval = *((jfloat*) &val);  // coerce to float type
 864   return fval;
 865 }
 866 
 867 static inline jdouble replicate8_imm(int con, int width) {
 868   // Load a constant of "width" (in bytes) and replicate it to fill 64 bits.
 869   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
 870   int bit_width = width * 8;
 871   jlong val = con;
 872   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
 873   while(bit_width < 64) {
 874     val |= (val << bit_width);
 875     bit_width <<= 1;
 876   }
 877   jdouble dval = *((jdouble*) &val);  // coerce to double type
 878   return dval;
 879 }
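// Illustrative expected values for the two helpers above (a sketch, excluded
// from the build; jint_cast/jlong_cast and CONST64 are the usual HotSpot
// bit-cast helpers assumed to be available here).
#if 0
static void replicate_imm_examples() {
  jfloat  f = replicate4_imm(0x12, 1);     // byte 0x12 broadcast -> raw bits 0x12121212
  jdouble d = replicate8_imm(0x1234, 2);   // short 0x1234 broadcast -> raw bits 0x1234123412341234
  assert(jint_cast(f)  == 0x12121212, "byte replicated into all four byte lanes");
  assert(jlong_cast(d) == CONST64(0x1234123412341234), "short replicated into all four short lanes");
}
#endif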
 880 
 881 #ifndef PRODUCT
 882   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 883     st->print("nop \t# %d bytes pad for loops and calls", _count);
 884   }
 885 #endif
 886 
 887   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
 888     MacroAssembler _masm(&cbuf);
 889     __ nop(_count);
 890   }
 891 
 892   uint MachNopNode::size(PhaseRegAlloc*) const {
 893     return _count;
 894   }
 895 
 896 #ifndef PRODUCT
 897   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 898     st->print("# breakpoint");
 899   }
 900 #endif
 901 
 902   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
 903     MacroAssembler _masm(&cbuf);
 904     __ int3();
 905   }
 906 
 907   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 908     return MachNode::size(ra_);
 909   }
 910 
 911 %}
 912 
 913 encode %{
 914 
 915   enc_class call_epilog %{
 916     if (VerifyStackAtCalls) {
 917       // Check that stack depth is unchanged: find majik cookie on stack
 918       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 919       MacroAssembler _masm(&cbuf);
 920       Label L;
 921       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 922       __ jccb(Assembler::equal, L);
 923       // Die if stack mismatch
 924       __ int3();
 925       __ bind(L);
 926     }
 927   %}
 928 
 929 %}
 930 
 931 
 932 //----------OPERANDS-----------------------------------------------------------
 933 // Operand definitions must precede instruction definitions for correct parsing
 934 // in the ADLC because operands constitute user defined types which are used in
 935 // instruction definitions.
 936 
 937 // Vectors
 938 operand vecS() %{
 939   constraint(ALLOC_IN_RC(vectors_reg));
 940   match(VecS);
 941 
 942   format %{ %}
 943   interface(REG_INTER);
 944 %}
 945 
 946 operand vecD() %{
 947   constraint(ALLOC_IN_RC(vectord_reg));
 948   match(VecD);
 949 
 950   format %{ %}
 951   interface(REG_INTER);
 952 %}
 953 
 954 operand vecX() %{
 955   constraint(ALLOC_IN_RC(vectorx_reg));
 956   match(VecX);
 957 
 958   format %{ %}
 959   interface(REG_INTER);
 960 %}
 961 
 962 operand vecY() %{
 963   constraint(ALLOC_IN_RC(vectory_reg));
 964   match(VecY);
 965 
 966   format %{ %}
 967   interface(REG_INTER);
 968 %}
 969 
 970 
 971 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 972 
 973 // ============================================================================
 974 
 975 instruct ShouldNotReachHere() %{
 976   match(Halt);
 977   format %{ "int3\t# ShouldNotReachHere" %}
 978   ins_encode %{
 979     __ int3();
 980   %}
 981   ins_pipe(pipe_slow);
 982 %}
 983 
 984 // ============================================================================
 985 
 986 instruct addF_reg(regF dst, regF src) %{
 987   predicate((UseSSE>=1) && (UseAVX == 0));
 988   match(Set dst (AddF dst src));
 989 
 990   format %{ "addss   $dst, $src" %}
 991   ins_cost(150);
 992   ins_encode %{
 993     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 994   %}
 995   ins_pipe(pipe_slow);
 996 %}
 997 
 998 instruct addF_mem(regF dst, memory src) %{
 999   predicate((UseSSE>=1) && (UseAVX == 0));
1000   match(Set dst (AddF dst (LoadF src)));
1001 
1002   format %{ "addss   $dst, $src" %}
1003   ins_cost(150);
1004   ins_encode %{
1005     __ addss($dst$$XMMRegister, $src$$Address);
1006   %}
1007   ins_pipe(pipe_slow);
1008 %}
1009 
1010 instruct addF_imm(regF dst, immF con) %{
1011   predicate((UseSSE>=1) && (UseAVX == 0));
1012   match(Set dst (AddF dst con));
1013   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1014   ins_cost(150);
1015   ins_encode %{
1016     __ addss($dst$$XMMRegister, $constantaddress($con));
1017   %}
1018   ins_pipe(pipe_slow);
1019 %}
1020 
1021 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
1022   predicate(UseAVX > 0);
1023   match(Set dst (AddF src1 src2));
1024 
1025   format %{ "vaddss  $dst, $src1, $src2" %}
1026   ins_cost(150);
1027   ins_encode %{
1028     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1029   %}
1030   ins_pipe(pipe_slow);
1031 %}
1032 
1033 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
1034   predicate(UseAVX > 0);
1035   match(Set dst (AddF src1 (LoadF src2)));
1036 
1037   format %{ "vaddss  $dst, $src1, $src2" %}
1038   ins_cost(150);
1039   ins_encode %{
1040     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1041   %}
1042   ins_pipe(pipe_slow);
1043 %}
1044 
1045 instruct addF_reg_imm(regF dst, regF src, immF con) %{
1046   predicate(UseAVX > 0);
1047   match(Set dst (AddF src con));
1048 
1049   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
1050   ins_cost(150);
1051   ins_encode %{
1052     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1053   %}
1054   ins_pipe(pipe_slow);
1055 %}
1056 
1057 instruct addD_reg(regD dst, regD src) %{
1058   predicate((UseSSE>=2) && (UseAVX == 0));
1059   match(Set dst (AddD dst src));
1060 
1061   format %{ "addsd   $dst, $src" %}
1062   ins_cost(150);
1063   ins_encode %{
1064     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
1065   %}
1066   ins_pipe(pipe_slow);
1067 %}
1068 
1069 instruct addD_mem(regD dst, memory src) %{
1070   predicate((UseSSE>=2) && (UseAVX == 0));
1071   match(Set dst (AddD dst (LoadD src)));
1072 
1073   format %{ "addsd   $dst, $src" %}
1074   ins_cost(150);
1075   ins_encode %{
1076     __ addsd($dst$$XMMRegister, $src$$Address);
1077   %}
1078   ins_pipe(pipe_slow);
1079 %}
1080 
1081 instruct addD_imm(regD dst, immD con) %{
1082   predicate((UseSSE>=2) && (UseAVX == 0));
1083   match(Set dst (AddD dst con));
1084   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1085   ins_cost(150);
1086   ins_encode %{
1087     __ addsd($dst$$XMMRegister, $constantaddress($con));
1088   %}
1089   ins_pipe(pipe_slow);
1090 %}
1091 
1092 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
1093   predicate(UseAVX > 0);
1094   match(Set dst (AddD src1 src2));
1095 
1096   format %{ "vaddsd  $dst, $src1, $src2" %}
1097   ins_cost(150);
1098   ins_encode %{
1099     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1100   %}
1101   ins_pipe(pipe_slow);
1102 %}
1103 
1104 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
1105   predicate(UseAVX > 0);
1106   match(Set dst (AddD src1 (LoadD src2)));
1107 
1108   format %{ "vaddsd  $dst, $src1, $src2" %}
1109   ins_cost(150);
1110   ins_encode %{
1111     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1112   %}
1113   ins_pipe(pipe_slow);
1114 %}
1115 
1116 instruct addD_reg_imm(regD dst, regD src, immD con) %{
1117   predicate(UseAVX > 0);
1118   match(Set dst (AddD src con));
1119 
1120   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
1121   ins_cost(150);
1122   ins_encode %{
1123     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1124   %}
1125   ins_pipe(pipe_slow);
1126 %}
1127 
1128 instruct subF_reg(regF dst, regF src) %{
1129   predicate((UseSSE>=1) && (UseAVX == 0));
1130   match(Set dst (SubF dst src));
1131 
1132   format %{ "subss   $dst, $src" %}
1133   ins_cost(150);
1134   ins_encode %{
1135     __ subss($dst$$XMMRegister, $src$$XMMRegister);
1136   %}
1137   ins_pipe(pipe_slow);
1138 %}
1139 
1140 instruct subF_mem(regF dst, memory src) %{
1141   predicate((UseSSE>=1) && (UseAVX == 0));
1142   match(Set dst (SubF dst (LoadF src)));
1143 
1144   format %{ "subss   $dst, $src" %}
1145   ins_cost(150);
1146   ins_encode %{
1147     __ subss($dst$$XMMRegister, $src$$Address);
1148   %}
1149   ins_pipe(pipe_slow);
1150 %}
1151 
1152 instruct subF_imm(regF dst, immF con) %{
1153   predicate((UseSSE>=1) && (UseAVX == 0));
1154   match(Set dst (SubF dst con));
1155   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1156   ins_cost(150);
1157   ins_encode %{
1158     __ subss($dst$$XMMRegister, $constantaddress($con));
1159   %}
1160   ins_pipe(pipe_slow);
1161 %}
1162 
1163 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
1164   predicate(UseAVX > 0);
1165   match(Set dst (SubF src1 src2));
1166 
1167   format %{ "vsubss  $dst, $src1, $src2" %}
1168   ins_cost(150);
1169   ins_encode %{
1170     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1171   %}
1172   ins_pipe(pipe_slow);
1173 %}
1174 
1175 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
1176   predicate(UseAVX > 0);
1177   match(Set dst (SubF src1 (LoadF src2)));
1178 
1179   format %{ "vsubss  $dst, $src1, $src2" %}
1180   ins_cost(150);
1181   ins_encode %{
1182     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1183   %}
1184   ins_pipe(pipe_slow);
1185 %}
1186 
1187 instruct subF_reg_imm(regF dst, regF src, immF con) %{
1188   predicate(UseAVX > 0);
1189   match(Set dst (SubF src con));
1190 
1191   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
1192   ins_cost(150);
1193   ins_encode %{
1194     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1195   %}
1196   ins_pipe(pipe_slow);
1197 %}
1198 
1199 instruct subD_reg(regD dst, regD src) %{
1200   predicate((UseSSE>=2) && (UseAVX == 0));
1201   match(Set dst (SubD dst src));
1202 
1203   format %{ "subsd   $dst, $src" %}
1204   ins_cost(150);
1205   ins_encode %{
1206     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
1207   %}
1208   ins_pipe(pipe_slow);
1209 %}
1210 
1211 instruct subD_mem(regD dst, memory src) %{
1212   predicate((UseSSE>=2) && (UseAVX == 0));
1213   match(Set dst (SubD dst (LoadD src)));
1214 
1215   format %{ "subsd   $dst, $src" %}
1216   ins_cost(150);
1217   ins_encode %{
1218     __ subsd($dst$$XMMRegister, $src$$Address);
1219   %}
1220   ins_pipe(pipe_slow);
1221 %}
1222 
1223 instruct subD_imm(regD dst, immD con) %{
1224   predicate((UseSSE>=2) && (UseAVX == 0));
1225   match(Set dst (SubD dst con));
1226   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1227   ins_cost(150);
1228   ins_encode %{
1229     __ subsd($dst$$XMMRegister, $constantaddress($con));
1230   %}
1231   ins_pipe(pipe_slow);
1232 %}
1233 
1234 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
1235   predicate(UseAVX > 0);
1236   match(Set dst (SubD src1 src2));
1237 
1238   format %{ "vsubsd  $dst, $src1, $src2" %}
1239   ins_cost(150);
1240   ins_encode %{
1241     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1242   %}
1243   ins_pipe(pipe_slow);
1244 %}
1245 
1246 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
1247   predicate(UseAVX > 0);
1248   match(Set dst (SubD src1 (LoadD src2)));
1249 
1250   format %{ "vsubsd  $dst, $src1, $src2" %}
1251   ins_cost(150);
1252   ins_encode %{
1253     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1254   %}
1255   ins_pipe(pipe_slow);
1256 %}
1257 
1258 instruct subD_reg_imm(regD dst, regD src, immD con) %{
1259   predicate(UseAVX > 0);
1260   match(Set dst (SubD src con));
1261 
1262   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
1263   ins_cost(150);
1264   ins_encode %{
1265     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1266   %}
1267   ins_pipe(pipe_slow);
1268 %}
1269 
1270 instruct mulF_reg(regF dst, regF src) %{
1271   predicate((UseSSE>=1) && (UseAVX == 0));
1272   match(Set dst (MulF dst src));
1273 
1274   format %{ "mulss   $dst, $src" %}
1275   ins_cost(150);
1276   ins_encode %{
1277     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
1278   %}
1279   ins_pipe(pipe_slow);
1280 %}
1281 
1282 instruct mulF_mem(regF dst, memory src) %{
1283   predicate((UseSSE>=1) && (UseAVX == 0));
1284   match(Set dst (MulF dst (LoadF src)));
1285 
1286   format %{ "mulss   $dst, $src" %}
1287   ins_cost(150);
1288   ins_encode %{
1289     __ mulss($dst$$XMMRegister, $src$$Address);
1290   %}
1291   ins_pipe(pipe_slow);
1292 %}
1293 
1294 instruct mulF_imm(regF dst, immF con) %{
1295   predicate((UseSSE>=1) && (UseAVX == 0));
1296   match(Set dst (MulF dst con));
1297   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1298   ins_cost(150);
1299   ins_encode %{
1300     __ mulss($dst$$XMMRegister, $constantaddress($con));
1301   %}
1302   ins_pipe(pipe_slow);
1303 %}
1304 
1305 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
1306   predicate(UseAVX > 0);
1307   match(Set dst (MulF src1 src2));
1308 
1309   format %{ "vmulss  $dst, $src1, $src2" %}
1310   ins_cost(150);
1311   ins_encode %{
1312     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1313   %}
1314   ins_pipe(pipe_slow);
1315 %}
1316 
1317 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
1318   predicate(UseAVX > 0);
1319   match(Set dst (MulF src1 (LoadF src2)));
1320 
1321   format %{ "vmulss  $dst, $src1, $src2" %}
1322   ins_cost(150);
1323   ins_encode %{
1324     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1325   %}
1326   ins_pipe(pipe_slow);
1327 %}
1328 
1329 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
1330   predicate(UseAVX > 0);
1331   match(Set dst (MulF src con));
1332 
1333   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
1334   ins_cost(150);
1335   ins_encode %{
1336     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1337   %}
1338   ins_pipe(pipe_slow);
1339 %}
1340 
1341 instruct mulD_reg(regD dst, regD src) %{
1342   predicate((UseSSE>=2) && (UseAVX == 0));
1343   match(Set dst (MulD dst src));
1344 
1345   format %{ "mulsd   $dst, $src" %}
1346   ins_cost(150);
1347   ins_encode %{
1348     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
1349   %}
1350   ins_pipe(pipe_slow);
1351 %}
1352 
1353 instruct mulD_mem(regD dst, memory src) %{
1354   predicate((UseSSE>=2) && (UseAVX == 0));
1355   match(Set dst (MulD dst (LoadD src)));
1356 
1357   format %{ "mulsd   $dst, $src" %}
1358   ins_cost(150);
1359   ins_encode %{
1360     __ mulsd($dst$$XMMRegister, $src$$Address);
1361   %}
1362   ins_pipe(pipe_slow);
1363 %}
1364 
1365 instruct mulD_imm(regD dst, immD con) %{
1366   predicate((UseSSE>=2) && (UseAVX == 0));
1367   match(Set dst (MulD dst con));
1368   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1369   ins_cost(150);
1370   ins_encode %{
1371     __ mulsd($dst$$XMMRegister, $constantaddress($con));
1372   %}
1373   ins_pipe(pipe_slow);
1374 %}
1375 
1376 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
1377   predicate(UseAVX > 0);
1378   match(Set dst (MulD src1 src2));
1379 
1380   format %{ "vmulsd  $dst, $src1, $src2" %}
1381   ins_cost(150);
1382   ins_encode %{
1383     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1384   %}
1385   ins_pipe(pipe_slow);
1386 %}
1387 
1388 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
1389   predicate(UseAVX > 0);
1390   match(Set dst (MulD src1 (LoadD src2)));
1391 
1392   format %{ "vmulsd  $dst, $src1, $src2" %}
1393   ins_cost(150);
1394   ins_encode %{
1395     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1396   %}
1397   ins_pipe(pipe_slow);
1398 %}
1399 
1400 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
1401   predicate(UseAVX > 0);
1402   match(Set dst (MulD src con));
1403 
1404   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
1405   ins_cost(150);
1406   ins_encode %{
1407     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1408   %}
1409   ins_pipe(pipe_slow);
1410 %}
1411 
1412 instruct divF_reg(regF dst, regF src) %{
1413   predicate((UseSSE>=1) && (UseAVX == 0));
1414   match(Set dst (DivF dst src));
1415 
1416   format %{ "divss   $dst, $src" %}
1417   ins_cost(150);
1418   ins_encode %{
1419     __ divss($dst$$XMMRegister, $src$$XMMRegister);
1420   %}
1421   ins_pipe(pipe_slow);
1422 %}
1423 
1424 instruct divF_mem(regF dst, memory src) %{
1425   predicate((UseSSE>=1) && (UseAVX == 0));
1426   match(Set dst (DivF dst (LoadF src)));
1427 
1428   format %{ "divss   $dst, $src" %}
1429   ins_cost(150);
1430   ins_encode %{
1431     __ divss($dst$$XMMRegister, $src$$Address);
1432   %}
1433   ins_pipe(pipe_slow);
1434 %}
1435 
1436 instruct divF_imm(regF dst, immF con) %{
1437   predicate((UseSSE>=1) && (UseAVX == 0));
1438   match(Set dst (DivF dst con));
1439   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1440   ins_cost(150);
1441   ins_encode %{
1442     __ divss($dst$$XMMRegister, $constantaddress($con));
1443   %}
1444   ins_pipe(pipe_slow);
1445 %}
1446 
1447 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
1448   predicate(UseAVX > 0);
1449   match(Set dst (DivF src1 src2));
1450 
1451   format %{ "vdivss  $dst, $src1, $src2" %}
1452   ins_cost(150);
1453   ins_encode %{
1454     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1455   %}
1456   ins_pipe(pipe_slow);
1457 %}
1458 
1459 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
1460   predicate(UseAVX > 0);
1461   match(Set dst (DivF src1 (LoadF src2)));
1462 
1463   format %{ "vdivss  $dst, $src1, $src2" %}
1464   ins_cost(150);
1465   ins_encode %{
1466     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1467   %}
1468   ins_pipe(pipe_slow);
1469 %}
1470 
1471 instruct divF_reg_imm(regF dst, regF src, immF con) %{
1472   predicate(UseAVX > 0);
1473   match(Set dst (DivF src con));
1474 
1475   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
1476   ins_cost(150);
1477   ins_encode %{
1478     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1479   %}
1480   ins_pipe(pipe_slow);
1481 %}
1482 
1483 instruct divD_reg(regD dst, regD src) %{
1484   predicate((UseSSE>=2) && (UseAVX == 0));
1485   match(Set dst (DivD dst src));
1486 
1487   format %{ "divsd   $dst, $src" %}
1488   ins_cost(150);
1489   ins_encode %{
1490     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
1491   %}
1492   ins_pipe(pipe_slow);
1493 %}
1494 
1495 instruct divD_mem(regD dst, memory src) %{
1496   predicate((UseSSE>=2) && (UseAVX == 0));
1497   match(Set dst (DivD dst (LoadD src)));
1498 
1499   format %{ "divsd   $dst, $src" %}
1500   ins_cost(150);
1501   ins_encode %{
1502     __ divsd($dst$$XMMRegister, $src$$Address);
1503   %}
1504   ins_pipe(pipe_slow);
1505 %}
1506 
1507 instruct divD_imm(regD dst, immD con) %{
1508   predicate((UseSSE>=2) && (UseAVX == 0));
1509   match(Set dst (DivD dst con));
1510   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1511   ins_cost(150);
1512   ins_encode %{
1513     __ divsd($dst$$XMMRegister, $constantaddress($con));
1514   %}
1515   ins_pipe(pipe_slow);
1516 %}
1517 
1518 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
1519   predicate(UseAVX > 0);
1520   match(Set dst (DivD src1 src2));
1521 
1522   format %{ "vdivsd  $dst, $src1, $src2" %}
1523   ins_cost(150);
1524   ins_encode %{
1525     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
1526   %}
1527   ins_pipe(pipe_slow);
1528 %}
1529 
1530 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
1531   predicate(UseAVX > 0);
1532   match(Set dst (DivD src1 (LoadD src2)));
1533 
1534   format %{ "vdivsd  $dst, $src1, $src2" %}
1535   ins_cost(150);
1536   ins_encode %{
1537     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
1538   %}
1539   ins_pipe(pipe_slow);
1540 %}
1541 
1542 instruct divD_reg_imm(regD dst, regD src, immD con) %{
1543   predicate(UseAVX > 0);
1544   match(Set dst (DivD src con));
1545 
1546   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
1547   ins_cost(150);
1548   ins_encode %{
1549     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
1550   %}
1551   ins_pipe(pipe_slow);
1552 %}
1553 
1554 instruct absF_reg(regF dst) %{
1555   predicate((UseSSE>=1) && (UseAVX == 0));
1556   match(Set dst (AbsF dst));
1557   ins_cost(150);
1558   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
1559   ins_encode %{
1560     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
1561   %}
1562   ins_pipe(pipe_slow);
1563 %}
1564 
1565 instruct absF_reg_reg(regF dst, regF src) %{
1566   predicate(UseAVX > 0);
1567   match(Set dst (AbsF src));
1568   ins_cost(150);
1569   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
1570   ins_encode %{
1571     bool vector256 = false;
1572     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
1573               ExternalAddress(float_signmask()), vector256);
1574   %}
1575   ins_pipe(pipe_slow);
1576 %}
1577 
1578 instruct absD_reg(regD dst) %{
1579   predicate((UseSSE>=2) && (UseAVX == 0));
1580   match(Set dst (AbsD dst));
1581   ins_cost(150);
1582   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
1583             "# abs double by sign masking" %}
1584   ins_encode %{
1585     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
1586   %}
1587   ins_pipe(pipe_slow);
1588 %}
1589 
1590 instruct absD_reg_reg(regD dst, regD src) %{
1591   predicate(UseAVX > 0);
1592   match(Set dst (AbsD src));
1593   ins_cost(150);
1594   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
1595             "# abs double by sign masking" %}
1596   ins_encode %{
1597     bool vector256 = false;
1598     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
1599               ExternalAddress(double_signmask()), vector256);
1600   %}
1601   ins_pipe(pipe_slow);
1602 %}
1603 
1604 instruct negF_reg(regF dst) %{
1605   predicate((UseSSE>=1) && (UseAVX == 0));
1606   match(Set dst (NegF dst));
1607   ins_cost(150);
1608   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
1609   ins_encode %{
1610     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
1611   %}
1612   ins_pipe(pipe_slow);
1613 %}
1614 
1615 instruct negF_reg_reg(regF dst, regF src) %{
1616   predicate(UseAVX > 0);
1617   match(Set dst (NegF src));
1618   ins_cost(150);
1619   format %{ "vxorps  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
1620   ins_encode %{
1621     bool vector256 = false;
1622     __ vxorps($dst$$XMMRegister, $src$$XMMRegister,
1623               ExternalAddress(float_signflip()), vector256);
1624   %}
1625   ins_pipe(pipe_slow);
1626 %}
1627 
1628 instruct negD_reg(regD dst) %{
1629   predicate((UseSSE>=2) && (UseAVX == 0));
1630   match(Set dst (NegD dst));
1631   ins_cost(150);
1632   format %{ "xorpd   $dst, [0x8000000000000000]\t"
1633             "# neg double by sign flipping" %}
1634   ins_encode %{
1635     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
1636   %}
1637   ins_pipe(pipe_slow);
1638 %}
1639 
1640 instruct negD_reg_reg(regD dst, regD src) %{
1641   predicate(UseAVX > 0);
1642   match(Set dst (NegD src));
1643   ins_cost(150);
1644   format %{ "vxorpd  $dst, $src, [0x8000000000000000]\t"
1645             "# neg double by sign flipping" %}
1646   ins_encode %{
1647     bool vector256 = false;
1648     __ vxorpd($dst$$XMMRegister, $src$$XMMRegister,
1649               ExternalAddress(double_signflip()), vector256);
1650   %}
1651   ins_pipe(pipe_slow);
1652 %}
1653 
1654 instruct sqrtF_reg(regF dst, regF src) %{
1655   predicate(UseSSE>=1);
1656   match(Set dst (ConvD2F (SqrtD (ConvF2D src))));
1657 
1658   format %{ "sqrtss  $dst, $src" %}
1659   ins_cost(150);
1660   ins_encode %{
1661     __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
1662   %}
1663   ins_pipe(pipe_slow);
1664 %}
1665 
1666 instruct sqrtF_mem(regF dst, memory src) %{
1667   predicate(UseSSE>=1);
1668   match(Set dst (ConvD2F (SqrtD (ConvF2D (LoadF src)))));
1669 
1670   format %{ "sqrtss  $dst, $src" %}
1671   ins_cost(150);
1672   ins_encode %{
1673     __ sqrtss($dst$$XMMRegister, $src$$Address);
1674   %}
1675   ins_pipe(pipe_slow);
1676 %}
1677 
1678 instruct sqrtF_imm(regF dst, immF con) %{
1679   predicate(UseSSE>=1);
1680   match(Set dst (ConvD2F (SqrtD (ConvF2D con))));
1681   format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
1682   ins_cost(150);
1683   ins_encode %{
1684     __ sqrtss($dst$$XMMRegister, $constantaddress($con));
1685   %}
1686   ins_pipe(pipe_slow);
1687 %}
1688 
1689 instruct sqrtD_reg(regD dst, regD src) %{
1690   predicate(UseSSE>=2);
1691   match(Set dst (SqrtD src));
1692 
1693   format %{ "sqrtsd  $dst, $src" %}
1694   ins_cost(150);
1695   ins_encode %{
1696     __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
1697   %}
1698   ins_pipe(pipe_slow);
1699 %}
1700 
1701 instruct sqrtD_mem(regD dst, memory src) %{
1702   predicate(UseSSE>=2);
1703   match(Set dst (SqrtD (LoadD src)));
1704 
1705   format %{ "sqrtsd  $dst, $src" %}
1706   ins_cost(150);
1707   ins_encode %{
1708     __ sqrtsd($dst$$XMMRegister, $src$$Address);
1709   %}
1710   ins_pipe(pipe_slow);
1711 %}
1712 
1713 instruct sqrtD_imm(regD dst, immD con) %{
1714   predicate(UseSSE>=2);
1715   match(Set dst (SqrtD con));
1716   format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
1717   ins_cost(150);
1718   ins_encode %{
1719     __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
1720   %}
1721   ins_pipe(pipe_slow);
1722 %}
1723 
1724 
1725 // ====================VECTOR INSTRUCTIONS=====================================
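//
// Vector operand class sizes used below: vecS = 4 bytes, vecD = 8 bytes,
// vecX = 16 bytes (a full XMM register), vecY = 32 bytes (a full YMM register).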
1726 
1727 // Load vectors (4 bytes long)
1728 instruct loadV4(vecS dst, memory mem) %{
1729   predicate(n->as_LoadVector()->memory_size() == 4);
1730   match(Set dst (LoadVector mem));
1731   ins_cost(125);
1732   format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
1733   ins_encode %{
1734     __ movdl($dst$$XMMRegister, $mem$$Address);
1735   %}
1736   ins_pipe( pipe_slow );
1737 %}
1738 
1739 // Load vectors (8 bytes long)
1740 instruct loadV8(vecD dst, memory mem) %{
1741   predicate(n->as_LoadVector()->memory_size() == 8);
1742   match(Set dst (LoadVector mem));
1743   ins_cost(125);
1744   format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
1745   ins_encode %{
1746     __ movq($dst$$XMMRegister, $mem$$Address);
1747   %}
1748   ins_pipe( pipe_slow );
1749 %}
1750 
1751 // Load vectors (16 bytes long)
1752 instruct loadV16(vecX dst, memory mem) %{
1753   predicate(n->as_LoadVector()->memory_size() == 16);
1754   match(Set dst (LoadVector mem));
1755   ins_cost(125);
1756   format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
1757   ins_encode %{
1758     __ movdqu($dst$$XMMRegister, $mem$$Address);
1759   %}
1760   ins_pipe( pipe_slow );
1761 %}
1762 
1763 // Load vectors (32 bytes long)
1764 instruct loadV32(vecY dst, memory mem) %{
1765   predicate(n->as_LoadVector()->memory_size() == 32);
1766   match(Set dst (LoadVector mem));
1767   ins_cost(125);
1768   format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
1769   ins_encode %{
1770     __ vmovdqu($dst$$XMMRegister, $mem$$Address);
1771   %}
1772   ins_pipe( pipe_slow );
1773 %}
1774 
1775 // Store vectors
1776 instruct storeV4(memory mem, vecS src) %{
1777   predicate(n->as_StoreVector()->memory_size() == 4);
1778   match(Set mem (StoreVector mem src));
1779   ins_cost(145);
1780   format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
1781   ins_encode %{
1782     __ movdl($mem$$Address, $src$$XMMRegister);
1783   %}
1784   ins_pipe( pipe_slow );
1785 %}
1786 
1787 instruct storeV8(memory mem, vecD src) %{
1788   predicate(n->as_StoreVector()->memory_size() == 8);
1789   match(Set mem (StoreVector mem src));
1790   ins_cost(145);
1791   format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
1792   ins_encode %{
1793     __ movq($mem$$Address, $src$$XMMRegister);
1794   %}
1795   ins_pipe( pipe_slow );
1796 %}
1797 
1798 instruct storeV16(memory mem, vecX src) %{
1799   predicate(n->as_StoreVector()->memory_size() == 16);
1800   match(Set mem (StoreVector mem src));
1801   ins_cost(145);
1802   format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
1803   ins_encode %{
1804     __ movdqu($mem$$Address, $src$$XMMRegister);
1805   %}
1806   ins_pipe( pipe_slow );
1807 %}
1808 
1809 instruct storeV32(memory mem, vecY src) %{
1810   predicate(n->as_StoreVector()->memory_size() == 32);
1811   match(Set mem (StoreVector mem src));
1812   ins_cost(145);
1813   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
1814   ins_encode %{
1815     __ vmovdqu($mem$$Address, $src$$XMMRegister);
1816   %}
1817   ins_pipe( pipe_slow );
1818 %}
1819 
1820 // Replicate byte scalar to be vector
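// The scalar is broadcast in stages: movd puts it in the low dword, punpcklbw
// with itself doubles the low byte into a 16-bit word, and pshuflw 0x00 copies
// that word across the low 64 bits; the wider forms then add punpcklqdq (fill
// the full XMM register) and vinserti128h (copy into the upper YMM half).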
1821 instruct Repl4B(vecS dst, rRegI src) %{
1822   predicate(n->as_Vector()->length() == 4);
1823   match(Set dst (ReplicateB src));
1824   format %{ "movd    $dst,$src\n\t"
1825             "punpcklbw $dst,$dst\n\t"
1826             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
1827   ins_encode %{
1828     __ movdl($dst$$XMMRegister, $src$$Register);
1829     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
1830     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1831   %}
1832   ins_pipe( pipe_slow );
1833 %}
1834 
1835 instruct Repl8B(vecD dst, rRegI src) %{
1836   predicate(n->as_Vector()->length() == 8);
1837   match(Set dst (ReplicateB src));
1838   format %{ "movd    $dst,$src\n\t"
1839             "punpcklbw $dst,$dst\n\t"
1840             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
1841   ins_encode %{
1842     __ movdl($dst$$XMMRegister, $src$$Register);
1843     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
1844     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1845   %}
1846   ins_pipe( pipe_slow );
1847 %}
1848 
1849 instruct Repl16B(vecX dst, rRegI src) %{
1850   predicate(n->as_Vector()->length() == 16);
1851   match(Set dst (ReplicateB src));
1852   format %{ "movd    $dst,$src\n\t"
1853             "punpcklbw $dst,$dst\n\t"
1854             "pshuflw $dst,$dst,0x00\n\t"
1855             "punpcklqdq $dst,$dst\t! replicate16B" %}
1856   ins_encode %{
1857     __ movdl($dst$$XMMRegister, $src$$Register);
1858     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
1859     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1860     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1861   %}
1862   ins_pipe( pipe_slow );
1863 %}
1864 
1865 instruct Repl32B(vecY dst, rRegI src) %{
1866   predicate(n->as_Vector()->length() == 32);
1867   match(Set dst (ReplicateB src));
1868   format %{ "movd    $dst,$src\n\t"
1869             "punpcklbw $dst,$dst\n\t"
1870             "pshuflw $dst,$dst,0x00\n\t"
1871             "punpcklqdq $dst,$dst\n\t"
1872             "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
1873   ins_encode %{
1874     __ movdl($dst$$XMMRegister, $src$$Register);
1875     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
1876     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1877     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1878     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
1879   %}
1880   ins_pipe( pipe_slow );
1881 %}
1882 
1883 // Replicate byte scalar immediate to be vector by loading from const table.
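// The replicate4_imm/replicate8_imm helpers pre-build the repeated pattern for
// the constant table; for example, with 1-byte elements an immediate of 0x41
// becomes 0x4141414141414141, so a single movq already loads eight copies of
// the byte.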
1884 instruct Repl4B_imm(vecS dst, immI con) %{
1885   predicate(n->as_Vector()->length() == 4);
1886   match(Set dst (ReplicateB con));
1887   format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
1888   ins_encode %{
1889     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
1890   %}
1891   ins_pipe( pipe_slow );
1892 %}
1893 
1894 instruct Repl8B_imm(vecD dst, immI con) %{
1895   predicate(n->as_Vector()->length() == 8);
1896   match(Set dst (ReplicateB con));
1897   format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
1898   ins_encode %{
1899     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
1900   %}
1901   ins_pipe( pipe_slow );
1902 %}
1903 
1904 instruct Repl16B_imm(vecX dst, immI con) %{
1905   predicate(n->as_Vector()->length() == 16);
1906   match(Set dst (ReplicateB con));
1907   format %{ "movq    $dst,[$constantaddress]\n\t"
1908             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
1909   ins_encode %{
1910     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
1911     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1912   %}
1913   ins_pipe( pipe_slow );
1914 %}
1915 
1916 instruct Repl32B_imm(vecY dst, immI con) %{
1917   predicate(n->as_Vector()->length() == 32);
1918   match(Set dst (ReplicateB con));
1919   format %{ "movq    $dst,[$constantaddress]\n\t"
1920             "punpcklqdq $dst,$dst\n\t"
1921             "vinserti128h $dst,$dst,$dst\t! lreplicate32B($con)" %}
1922   ins_encode %{
1923     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
1924     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
1925     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
1926   %}
1927   ins_pipe( pipe_slow );
1928 %}
1929 
1930 // Replicate byte scalar zero to be vector
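// XORing a register with itself is the usual zeroing idiom, so none of the
// *_zero forms needs a constant load; the 256-bit variants clear the full YMM
// register the same way with vpxor/vxorps/vxorpd.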
1931 instruct Repl4B_zero(vecS dst, immI0 zero) %{
1932   predicate(n->as_Vector()->length() == 4);
1933   match(Set dst (ReplicateB zero));
1934   format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
1935   ins_encode %{
1936     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
1937   %}
1938   ins_pipe( fpu_reg_reg );
1939 %}
1940 
1941 instruct Repl8B_zero(vecD dst, immI0 zero) %{
1942   predicate(n->as_Vector()->length() == 8);
1943   match(Set dst (ReplicateB zero));
1944   format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
1945   ins_encode %{
1946     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
1947   %}
1948   ins_pipe( fpu_reg_reg );
1949 %}
1950 
1951 instruct Repl16B_zero(vecX dst, immI0 zero) %{
1952   predicate(n->as_Vector()->length() == 16);
1953   match(Set dst (ReplicateB zero));
1954   format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
1955   ins_encode %{
1956     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
1957   %}
1958   ins_pipe( fpu_reg_reg );
1959 %}
1960 
1961 instruct Repl32B_zero(vecY dst, immI0 zero) %{
1962   predicate(n->as_Vector()->length() == 32);
1963   match(Set dst (ReplicateB zero));
1964   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
1965   ins_encode %{
    // 256-bit vpxor is only available with AVX2; on plain AVX, vxorpd is used for the clear instead.
1967     bool vector256 = true;
1968     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
1969   %}
1970   ins_pipe( fpu_reg_reg );
1971 %}
1972 
1973 // Replicate char/short (2 byte) scalar to be vector
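// For 16-bit elements the punpcklbw step is unnecessary: pshuflw 0x00 already
// broadcasts the low word across the low 64 bits, and the wider forms again
// add punpcklqdq and vinserti128h.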
1974 instruct Repl2S(vecS dst, rRegI src) %{
1975   predicate(n->as_Vector()->length() == 2);
1976   match(Set dst (ReplicateS src));
1977   format %{ "movd    $dst,$src\n\t"
1978             "pshuflw $dst,$dst,0x00\t! replicate2S" %}
1979   ins_encode %{
1980     __ movdl($dst$$XMMRegister, $src$$Register);
1981     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1982   %}
1983   ins_pipe( fpu_reg_reg );
1984 %}
1985 
1986 instruct Repl4S(vecD dst, rRegI src) %{
1987   predicate(n->as_Vector()->length() == 4);
1988   match(Set dst (ReplicateS src));
1989   format %{ "movd    $dst,$src\n\t"
1990             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
1991   ins_encode %{
1992     __ movdl($dst$$XMMRegister, $src$$Register);
1993     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
1994   %}
1995   ins_pipe( fpu_reg_reg );
1996 %}
1997 
1998 instruct Repl8S(vecX dst, rRegI src) %{
1999   predicate(n->as_Vector()->length() == 8);
2000   match(Set dst (ReplicateS src));
2001   format %{ "movd    $dst,$src\n\t"
2002             "pshuflw $dst,$dst,0x00\n\t"
2003             "punpcklqdq $dst,$dst\t! replicate8S" %}
2004   ins_encode %{
2005     __ movdl($dst$$XMMRegister, $src$$Register);
2006     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2007     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2008   %}
2009   ins_pipe( pipe_slow );
2010 %}
2011 
2012 instruct Repl16S(vecY dst, rRegI src) %{
2013   predicate(n->as_Vector()->length() == 16);
2014   match(Set dst (ReplicateS src));
2015   format %{ "movd    $dst,$src\n\t"
2016             "pshuflw $dst,$dst,0x00\n\t"
2017             "punpcklqdq $dst,$dst\n\t"
2018             "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
2019   ins_encode %{
2020     __ movdl($dst$$XMMRegister, $src$$Register);
2021     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2022     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2023     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2024   %}
2025   ins_pipe( pipe_slow );
2026 %}
2027 
2028 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
2029 instruct Repl2S_imm(vecS dst, immI con) %{
2030   predicate(n->as_Vector()->length() == 2);
2031   match(Set dst (ReplicateS con));
2032   format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
2033   ins_encode %{
2034     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
2035   %}
2036   ins_pipe( fpu_reg_reg );
2037 %}
2038 
2039 instruct Repl4S_imm(vecD dst, immI con) %{
2040   predicate(n->as_Vector()->length() == 4);
2041   match(Set dst (ReplicateS con));
2042   format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
2043   ins_encode %{
2044     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
2045   %}
2046   ins_pipe( fpu_reg_reg );
2047 %}
2048 
2049 instruct Repl8S_imm(vecX dst, immI con) %{
2050   predicate(n->as_Vector()->length() == 8);
2051   match(Set dst (ReplicateS con));
2052   format %{ "movq    $dst,[$constantaddress]\n\t"
2053             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
2054   ins_encode %{
2055     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
2056     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2057   %}
2058   ins_pipe( pipe_slow );
2059 %}
2060 
2061 instruct Repl16S_imm(vecY dst, immI con) %{
2062   predicate(n->as_Vector()->length() == 16);
2063   match(Set dst (ReplicateS con));
2064   format %{ "movq    $dst,[$constantaddress]\n\t"
2065             "punpcklqdq $dst,$dst\n\t"
2066             "vinserti128h $dst,$dst,$dst\t! replicate16S($con)" %}
2067   ins_encode %{
2068     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
2069     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2070     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2071   %}
2072   ins_pipe( pipe_slow );
2073 %}
2074 
2075 // Replicate char/short (2 byte) scalar zero to be vector
2076 instruct Repl2S_zero(vecS dst, immI0 zero) %{
2077   predicate(n->as_Vector()->length() == 2);
2078   match(Set dst (ReplicateS zero));
2079   format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
2080   ins_encode %{
2081     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2082   %}
2083   ins_pipe( fpu_reg_reg );
2084 %}
2085 
2086 instruct Repl4S_zero(vecD dst, immI0 zero) %{
2087   predicate(n->as_Vector()->length() == 4);
2088   match(Set dst (ReplicateS zero));
2089   format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
2090   ins_encode %{
2091     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2092   %}
2093   ins_pipe( fpu_reg_reg );
2094 %}
2095 
2096 instruct Repl8S_zero(vecX dst, immI0 zero) %{
2097   predicate(n->as_Vector()->length() == 8);
2098   match(Set dst (ReplicateS zero));
2099   format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
2100   ins_encode %{
2101     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2102   %}
2103   ins_pipe( fpu_reg_reg );
2104 %}
2105 
2106 instruct Repl16S_zero(vecY dst, immI0 zero) %{
2107   predicate(n->as_Vector()->length() == 16);
2108   match(Set dst (ReplicateS zero));
2109   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
2110   ins_encode %{
    // 256-bit vpxor is only available with AVX2; on plain AVX, vxorpd is used for the clear instead.
2112     bool vector256 = true;
2113     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2114   %}
2115   ins_pipe( fpu_reg_reg );
2116 %}
2117 
2118 // Replicate integer (4 byte) scalar to be vector
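// For 32-bit elements a single pshufd 0x00 broadcasts the low dword to all
// four dword lanes of the XMM register.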
2119 instruct Repl2I(vecD dst, rRegI src) %{
2120   predicate(n->as_Vector()->length() == 2);
2121   match(Set dst (ReplicateI src));
2122   format %{ "movd    $dst,$src\n\t"
2123             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
2124   ins_encode %{
2125     __ movdl($dst$$XMMRegister, $src$$Register);
2126     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2127   %}
2128   ins_pipe( fpu_reg_reg );
2129 %}
2130 
2131 instruct Repl4I(vecX dst, rRegI src) %{
2132   predicate(n->as_Vector()->length() == 4);
2133   match(Set dst (ReplicateI src));
2134   format %{ "movd    $dst,$src\n\t"
2135             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
2136   ins_encode %{
2137     __ movdl($dst$$XMMRegister, $src$$Register);
2138     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2139   %}
2140   ins_pipe( pipe_slow );
2141 %}
2142 
2143 instruct Repl8I(vecY dst, rRegI src) %{
2144   predicate(n->as_Vector()->length() == 8);
2145   match(Set dst (ReplicateI src));
2146   format %{ "movd    $dst,$src\n\t"
2147             "pshufd  $dst,$dst,0x00\n\t"
2148             "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
2149   ins_encode %{
2150     __ movdl($dst$$XMMRegister, $src$$Register);
2151     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2152     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2153   %}
2154   ins_pipe( pipe_slow );
2155 %}
2156 
2157 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
2158 instruct Repl2I_imm(vecD dst, immI con) %{
2159   predicate(n->as_Vector()->length() == 2);
2160   match(Set dst (ReplicateI con));
2161   format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
2162   ins_encode %{
2163     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
2164   %}
2165   ins_pipe( fpu_reg_reg );
2166 %}
2167 
2168 instruct Repl4I_imm(vecX dst, immI con) %{
2169   predicate(n->as_Vector()->length() == 4);
2170   match(Set dst (ReplicateI con));
2171   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
2172             "punpcklqdq $dst,$dst" %}
2173   ins_encode %{
2174     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
2175     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2176   %}
2177   ins_pipe( pipe_slow );
2178 %}
2179 
2180 instruct Repl8I_imm(vecY dst, immI con) %{
2181   predicate(n->as_Vector()->length() == 8);
2182   match(Set dst (ReplicateI con));
2183   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
2184             "punpcklqdq $dst,$dst\n\t"
2185             "vinserti128h $dst,$dst,$dst" %}
2186   ins_encode %{
2187     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
2188     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2189     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2190   %}
2191   ins_pipe( pipe_slow );
2192 %}
2193 
// An integer can be loaded into an XMM register directly from memory.
2195 instruct Repl2I_mem(vecD dst, memory mem) %{
2196   predicate(n->as_Vector()->length() == 2);
2197   match(Set dst (ReplicateI (LoadI mem)));
2198   format %{ "movd    $dst,$mem\n\t"
2199             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
2200   ins_encode %{
2201     __ movdl($dst$$XMMRegister, $mem$$Address);
2202     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2203   %}
2204   ins_pipe( fpu_reg_reg );
2205 %}
2206 
2207 instruct Repl4I_mem(vecX dst, memory mem) %{
2208   predicate(n->as_Vector()->length() == 4);
2209   match(Set dst (ReplicateI (LoadI mem)));
2210   format %{ "movd    $dst,$mem\n\t"
2211             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
2212   ins_encode %{
2213     __ movdl($dst$$XMMRegister, $mem$$Address);
2214     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2215   %}
2216   ins_pipe( pipe_slow );
2217 %}
2218 
2219 instruct Repl8I_mem(vecY dst, memory mem) %{
2220   predicate(n->as_Vector()->length() == 8);
2221   match(Set dst (ReplicateI (LoadI mem)));
2222   format %{ "movd    $dst,$mem\n\t"
2223             "pshufd  $dst,$dst,0x00\n\t"
2224             "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
2225   ins_encode %{
2226     __ movdl($dst$$XMMRegister, $mem$$Address);
2227     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
2228     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2229   %}
2230   ins_pipe( pipe_slow );
2231 %}
2232 
2233 // Replicate integer (4 byte) scalar zero to be vector
2234 instruct Repl2I_zero(vecD dst, immI0 zero) %{
2235   predicate(n->as_Vector()->length() == 2);
2236   match(Set dst (ReplicateI zero));
2237   format %{ "pxor    $dst,$dst\t! replicate2I" %}
2238   ins_encode %{
2239     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2240   %}
2241   ins_pipe( fpu_reg_reg );
2242 %}
2243 
2244 instruct Repl4I_zero(vecX dst, immI0 zero) %{
2245   predicate(n->as_Vector()->length() == 4);
2246   match(Set dst (ReplicateI zero));
2247   format %{ "pxor    $dst,$dst\t! replicate4I zero)" %}
2248   ins_encode %{
2249     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2250   %}
2251   ins_pipe( fpu_reg_reg );
2252 %}
2253 
2254 instruct Repl8I_zero(vecY dst, immI0 zero) %{
2255   predicate(n->as_Vector()->length() == 8);
2256   match(Set dst (ReplicateI zero));
2257   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
2258   ins_encode %{
    // 256-bit vpxor is only available with AVX2; on plain AVX, vxorpd is used for the clear instead.
2260     bool vector256 = true;
2261     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2262   %}
2263   ins_pipe( fpu_reg_reg );
2264 %}
2265 
2266 // Replicate long (8 byte) scalar to be vector
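// On 64-bit (_LP64) the long arrives in one GP register and movdq moves it into
// the XMM register directly; on 32-bit the value is reassembled from the
// register pair with two movdl and a punpckldq before being duplicated.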
2267 #ifdef _LP64
2268 instruct Repl2L(vecX dst, rRegL src) %{
2269   predicate(n->as_Vector()->length() == 2);
2270   match(Set dst (ReplicateL src));
2271   format %{ "movdq   $dst,$src\n\t"
2272             "punpcklqdq $dst,$dst\t! replicate2L" %}
2273   ins_encode %{
2274     __ movdq($dst$$XMMRegister, $src$$Register);
2275     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2276   %}
2277   ins_pipe( pipe_slow );
2278 %}
2279 
2280 instruct Repl4L(vecY dst, rRegL src) %{
2281   predicate(n->as_Vector()->length() == 4);
2282   match(Set dst (ReplicateL src));
2283   format %{ "movdq   $dst,$src\n\t"
2284             "punpcklqdq $dst,$dst\n\t"
2285             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
2286   ins_encode %{
2287     __ movdq($dst$$XMMRegister, $src$$Register);
2288     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2289     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2290   %}
2291   ins_pipe( pipe_slow );
2292 %}
2293 #else // _LP64
2294 instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
2295   predicate(n->as_Vector()->length() == 2);
2296   match(Set dst (ReplicateL src));
2297   effect(TEMP dst, USE src, TEMP tmp);
2298   format %{ "movdl   $dst,$src.lo\n\t"
2299             "movdl   $tmp,$src.hi\n\t"
2300             "punpckldq $dst,$tmp\n\t"
2301             "punpcklqdq $dst,$dst\t! replicate2L"%}
2302   ins_encode %{
2303     __ movdl($dst$$XMMRegister, $src$$Register);
2304     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
2305     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
2306     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2307   %}
2308   ins_pipe( pipe_slow );
2309 %}
2310 
2311 instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
2312   predicate(n->as_Vector()->length() == 4);
2313   match(Set dst (ReplicateL src));
2314   effect(TEMP dst, USE src, TEMP tmp);
2315   format %{ "movdl   $dst,$src.lo\n\t"
2316             "movdl   $tmp,$src.hi\n\t"
2317             "punpckldq $dst,$tmp\n\t"
2318             "punpcklqdq $dst,$dst\n\t"
2319             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
2320   ins_encode %{
2321     __ movdl($dst$$XMMRegister, $src$$Register);
2322     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
2323     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
2324     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2325     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2326   %}
2327   ins_pipe( pipe_slow );
2328 %}
2329 #endif // _LP64
2330 
2331 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
2332 instruct Repl2L_imm(vecX dst, immL con) %{
2333   predicate(n->as_Vector()->length() == 2);
2334   match(Set dst (ReplicateL con));
2335   format %{ "movq    $dst,[$constantaddress]\n\t"
2336             "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
2337   ins_encode %{
2338     __ movq($dst$$XMMRegister, $constantaddress($con));
2339     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2340   %}
2341   ins_pipe( pipe_slow );
2342 %}
2343 
2344 instruct Repl4L_imm(vecY dst, immL con) %{
2345   predicate(n->as_Vector()->length() == 4);
2346   match(Set dst (ReplicateL con));
2347   format %{ "movq    $dst,[$constantaddress]\n\t"
2348             "punpcklqdq $dst,$dst\n\t"
2349             "vinserti128h $dst,$dst,$dst\t! replicate4L($con)" %}
2350   ins_encode %{
2351     __ movq($dst$$XMMRegister, $constantaddress($con));
2352     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2353     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2354   %}
2355   ins_pipe( pipe_slow );
2356 %}
2357 
// A long can be loaded into an XMM register directly from memory.
2359 instruct Repl2L_mem(vecX dst, memory mem) %{
2360   predicate(n->as_Vector()->length() == 2);
2361   match(Set dst (ReplicateL (LoadL mem)));
2362   format %{ "movq    $dst,$mem\n\t"
2363             "punpcklqdq $dst,$dst\t! replicate2L" %}
2364   ins_encode %{
2365     __ movq($dst$$XMMRegister, $mem$$Address);
2366     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2367   %}
2368   ins_pipe( pipe_slow );
2369 %}
2370 
2371 instruct Repl4L_mem(vecY dst, memory mem) %{
2372   predicate(n->as_Vector()->length() == 4);
2373   match(Set dst (ReplicateL (LoadL mem)));
2374   format %{ "movq    $dst,$mem\n\t"
2375             "punpcklqdq $dst,$dst\n\t"
2376             "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
2377   ins_encode %{
2378     __ movq($dst$$XMMRegister, $mem$$Address);
2379     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
2380     __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2381   %}
2382   ins_pipe( pipe_slow );
2383 %}
2384 
2385 // Replicate long (8 byte) scalar zero to be vector
2386 instruct Repl2L_zero(vecX dst, immL0 zero) %{
2387   predicate(n->as_Vector()->length() == 2);
2388   match(Set dst (ReplicateL zero));
2389   format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
2390   ins_encode %{
2391     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
2392   %}
2393   ins_pipe( fpu_reg_reg );
2394 %}
2395 
2396 instruct Repl4L_zero(vecY dst, immL0 zero) %{
2397   predicate(n->as_Vector()->length() == 4);
2398   match(Set dst (ReplicateL zero));
2399   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
2400   ins_encode %{
    // 256-bit vpxor is only available with AVX2; on plain AVX, vxorpd is used for the clear instead.
2402     bool vector256 = true;
2403     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2404   %}
2405   ins_pipe( fpu_reg_reg );
2406 %}
2407 
2408 // Replicate float (4 byte) scalar to be vector
2409 instruct Repl2F(vecD dst, regF src) %{
2410   predicate(n->as_Vector()->length() == 2);
2411   match(Set dst (ReplicateF src));
2412   format %{ "pshufd  $dst,$dst,0x00\t! replicate2F" %}
2413   ins_encode %{
2414     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
2415   %}
2416   ins_pipe( fpu_reg_reg );
2417 %}
2418 
2419 instruct Repl4F(vecX dst, regF src) %{
2420   predicate(n->as_Vector()->length() == 4);
2421   match(Set dst (ReplicateF src));
2422   format %{ "pshufd  $dst,$dst,0x00\t! replicate4F" %}
2423   ins_encode %{
2424     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
2425   %}
2426   ins_pipe( pipe_slow );
2427 %}
2428 
2429 instruct Repl8F(vecY dst, regF src) %{
2430   predicate(n->as_Vector()->length() == 8);
2431   match(Set dst (ReplicateF src));
2432   format %{ "pshufd  $dst,$src,0x00\n\t"
2433             "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
2434   ins_encode %{
2435     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
2436     __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2437   %}
2438   ins_pipe( pipe_slow );
2439 %}
2440 
2441 // Replicate float (4 byte) scalar zero to be vector
2442 instruct Repl2F_zero(vecD dst, immF0 zero) %{
2443   predicate(n->as_Vector()->length() == 2);
2444   match(Set dst (ReplicateF zero));
2445   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
2446   ins_encode %{
2447     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
2448   %}
2449   ins_pipe( fpu_reg_reg );
2450 %}
2451 
2452 instruct Repl4F_zero(vecX dst, immF0 zero) %{
2453   predicate(n->as_Vector()->length() == 4);
2454   match(Set dst (ReplicateF zero));
2455   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
2456   ins_encode %{
2457     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
2458   %}
2459   ins_pipe( fpu_reg_reg );
2460 %}
2461 
2462 instruct Repl8F_zero(vecY dst, immF0 zero) %{
2463   predicate(n->as_Vector()->length() == 8);
2464   match(Set dst (ReplicateF zero));
2465   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
2466   ins_encode %{
2467     bool vector256 = true;
2468     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2469   %}
2470   ins_pipe( fpu_reg_reg );
2471 %}
2472 
// Replicate double (8 byte) scalar to be vector
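// pshufd with shuffle mask 0x44 selects dwords {0,1,0,1}, i.e. it duplicates
// the low 64-bit double into both halves of the XMM register.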
2474 instruct Repl2D(vecX dst, regD src) %{
2475   predicate(n->as_Vector()->length() == 2);
2476   match(Set dst (ReplicateD src));
2477   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
2478   ins_encode %{
2479     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
2480   %}
2481   ins_pipe( pipe_slow );
2482 %}
2483 
2484 instruct Repl4D(vecY dst, regD src) %{
2485   predicate(n->as_Vector()->length() == 4);
2486   match(Set dst (ReplicateD src));
2487   format %{ "pshufd  $dst,$src,0x44\n\t"
2488             "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
2489   ins_encode %{
2490     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
2491     __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
2492   %}
2493   ins_pipe( pipe_slow );
2494 %}
2495 
2496 // Replicate double (8 byte) scalar zero to be vector
2497 instruct Repl2D_zero(vecX dst, immD0 zero) %{
2498   predicate(n->as_Vector()->length() == 2);
2499   match(Set dst (ReplicateD zero));
2500   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
2501   ins_encode %{
2502     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
2503   %}
2504   ins_pipe( fpu_reg_reg );
2505 %}
2506 
2507 instruct Repl4D_zero(vecY dst, immD0 zero) %{
2508   predicate(n->as_Vector()->length() == 4);
2509   match(Set dst (ReplicateD zero));
2510   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
2511   ins_encode %{
2512     bool vector256 = true;
2513     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
2514   %}
2515   ins_pipe( fpu_reg_reg );
2516 %}
2517 
2518 // ====================VECTOR ARITHMETIC=======================================
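//
// Each operation comes in an SSE two-operand form (dst = dst op src), an AVX
// three-operand register form, and an AVX form that folds the load of the
// second operand from memory. The 128-bit AVX forms are guarded by UseAVX > 0;
// the 256-bit integer forms require UseAVX > 1 because 256-bit integer
// instructions were only added with AVX2, while the 256-bit float/double forms
// work on plain AVX.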
2519 
2520 // --------------------------------- ADD --------------------------------------
2521 
2522 // Bytes vector add
2523 instruct vadd4B(vecS dst, vecS src) %{
2524   predicate(n->as_Vector()->length() == 4);
2525   match(Set dst (AddVB dst src));
2526   format %{ "paddb   $dst,$src\t! add packed4B" %}
2527   ins_encode %{
2528     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
2529   %}
2530   ins_pipe( pipe_slow );
2531 %}
2532 
2533 instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
2534   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2535   match(Set dst (AddVB src1 src2));
2536   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
2537   ins_encode %{
2538     bool vector256 = false;
2539     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2540   %}
2541   ins_pipe( pipe_slow );
2542 %}
2543 
2544 instruct vadd8B(vecD dst, vecD src) %{
2545   predicate(n->as_Vector()->length() == 8);
2546   match(Set dst (AddVB dst src));
2547   format %{ "paddb   $dst,$src\t! add packed8B" %}
2548   ins_encode %{
2549     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
2550   %}
2551   ins_pipe( pipe_slow );
2552 %}
2553 
2554 instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
2555   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
2556   match(Set dst (AddVB src1 src2));
2557   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
2558   ins_encode %{
2559     bool vector256 = false;
2560     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2561   %}
2562   ins_pipe( pipe_slow );
2563 %}
2564 
2565 instruct vadd16B(vecX dst, vecX src) %{
2566   predicate(n->as_Vector()->length() == 16);
2567   match(Set dst (AddVB dst src));
2568   format %{ "paddb   $dst,$src\t! add packed16B" %}
2569   ins_encode %{
2570     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
2571   %}
2572   ins_pipe( pipe_slow );
2573 %}
2574 
2575 instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
2576   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
2577   match(Set dst (AddVB src1 src2));
2578   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
2579   ins_encode %{
2580     bool vector256 = false;
2581     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2582   %}
2583   ins_pipe( pipe_slow );
2584 %}
2585 
2586 instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{
2587   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
2588   match(Set dst (AddVB src (LoadVector mem)));
2589   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
2590   ins_encode %{
2591     bool vector256 = false;
2592     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2593   %}
2594   ins_pipe( pipe_slow );
2595 %}
2596 
2597 instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{
2598   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
2599   match(Set dst (AddVB src1 src2));
2600   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
2601   ins_encode %{
2602     bool vector256 = true;
2603     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2604   %}
2605   ins_pipe( pipe_slow );
2606 %}
2607 
2608 instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{
2609   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
2610   match(Set dst (AddVB src (LoadVector mem)));
2611   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
2612   ins_encode %{
2613     bool vector256 = true;
2614     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2615   %}
2616   ins_pipe( pipe_slow );
2617 %}
2618 
2619 // Shorts/Chars vector add
2620 instruct vadd2S(vecS dst, vecS src) %{
2621   predicate(n->as_Vector()->length() == 2);
2622   match(Set dst (AddVS dst src));
2623   format %{ "paddw   $dst,$src\t! add packed2S" %}
2624   ins_encode %{
2625     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
2626   %}
2627   ins_pipe( pipe_slow );
2628 %}
2629 
2630 instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
2631   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
2632   match(Set dst (AddVS src1 src2));
2633   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
2634   ins_encode %{
2635     bool vector256 = false;
2636     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2637   %}
2638   ins_pipe( pipe_slow );
2639 %}
2640 
2641 instruct vadd4S(vecD dst, vecD src) %{
2642   predicate(n->as_Vector()->length() == 4);
2643   match(Set dst (AddVS dst src));
2644   format %{ "paddw   $dst,$src\t! add packed4S" %}
2645   ins_encode %{
2646     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
2647   %}
2648   ins_pipe( pipe_slow );
2649 %}
2650 
2651 instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
2652   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2653   match(Set dst (AddVS src1 src2));
2654   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
2655   ins_encode %{
2656     bool vector256 = false;
2657     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2658   %}
2659   ins_pipe( pipe_slow );
2660 %}
2661 
2662 instruct vadd8S(vecX dst, vecX src) %{
2663   predicate(n->as_Vector()->length() == 8);
2664   match(Set dst (AddVS dst src));
2665   format %{ "paddw   $dst,$src\t! add packed8S" %}
2666   ins_encode %{
2667     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
2668   %}
2669   ins_pipe( pipe_slow );
2670 %}
2671 
2672 instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
2673   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
2674   match(Set dst (AddVS src1 src2));
2675   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
2676   ins_encode %{
2677     bool vector256 = false;
2678     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2679   %}
2680   ins_pipe( pipe_slow );
2681 %}
2682 
2683 instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{
2684   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
2685   match(Set dst (AddVS src (LoadVector mem)));
2686   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
2687   ins_encode %{
2688     bool vector256 = false;
2689     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2690   %}
2691   ins_pipe( pipe_slow );
2692 %}
2693 
2694 instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{
2695   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
2696   match(Set dst (AddVS src1 src2));
2697   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
2698   ins_encode %{
2699     bool vector256 = true;
2700     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2701   %}
2702   ins_pipe( pipe_slow );
2703 %}
2704 
2705 instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{
2706   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
2707   match(Set dst (AddVS src (LoadVector mem)));
2708   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
2709   ins_encode %{
2710     bool vector256 = true;
2711     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2712   %}
2713   ins_pipe( pipe_slow );
2714 %}
2715 
2716 // Integers vector add
2717 instruct vadd2I(vecD dst, vecD src) %{
2718   predicate(n->as_Vector()->length() == 2);
2719   match(Set dst (AddVI dst src));
2720   format %{ "paddd   $dst,$src\t! add packed2I" %}
2721   ins_encode %{
2722     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
2723   %}
2724   ins_pipe( pipe_slow );
2725 %}
2726 
2727 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
2728   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
2729   match(Set dst (AddVI src1 src2));
2730   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
2731   ins_encode %{
2732     bool vector256 = false;
2733     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2734   %}
2735   ins_pipe( pipe_slow );
2736 %}
2737 
2738 instruct vadd4I(vecX dst, vecX src) %{
2739   predicate(n->as_Vector()->length() == 4);
2740   match(Set dst (AddVI dst src));
2741   format %{ "paddd   $dst,$src\t! add packed4I" %}
2742   ins_encode %{
2743     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
2744   %}
2745   ins_pipe( pipe_slow );
2746 %}
2747 
2748 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
2749   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2750   match(Set dst (AddVI src1 src2));
2751   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
2752   ins_encode %{
2753     bool vector256 = false;
2754     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2755   %}
2756   ins_pipe( pipe_slow );
2757 %}
2758 
2759 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
2760   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2761   match(Set dst (AddVI src (LoadVector mem)));
2762   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
2763   ins_encode %{
2764     bool vector256 = false;
2765     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2766   %}
2767   ins_pipe( pipe_slow );
2768 %}
2769 
2770 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
2771   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
2772   match(Set dst (AddVI src1 src2));
2773   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
2774   ins_encode %{
2775     bool vector256 = true;
2776     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2777   %}
2778   ins_pipe( pipe_slow );
2779 %}
2780 
2781 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
2782   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
2783   match(Set dst (AddVI src (LoadVector mem)));
2784   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
2785   ins_encode %{
2786     bool vector256 = true;
2787     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2788   %}
2789   ins_pipe( pipe_slow );
2790 %}
2791 
2792 // Longs vector add
2793 instruct vadd2L(vecX dst, vecX src) %{
2794   predicate(n->as_Vector()->length() == 2);
2795   match(Set dst (AddVL dst src));
2796   format %{ "paddq   $dst,$src\t! add packed2L" %}
2797   ins_encode %{
2798     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
2799   %}
2800   ins_pipe( pipe_slow );
2801 %}
2802 
2803 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
2804   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
2805   match(Set dst (AddVL src1 src2));
2806   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
2807   ins_encode %{
2808     bool vector256 = false;
2809     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2810   %}
2811   ins_pipe( pipe_slow );
2812 %}
2813 
2814 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
2815   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
2816   match(Set dst (AddVL src (LoadVector mem)));
2817   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
2818   ins_encode %{
2819     bool vector256 = false;
2820     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2821   %}
2822   ins_pipe( pipe_slow );
2823 %}
2824 
2825 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
2826   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
2827   match(Set dst (AddVL src1 src2));
2828   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
2829   ins_encode %{
2830     bool vector256 = true;
2831     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2832   %}
2833   ins_pipe( pipe_slow );
2834 %}
2835 
2836 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
2837   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
2838   match(Set dst (AddVL src (LoadVector mem)));
2839   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
2840   ins_encode %{
2841     bool vector256 = true;
2842     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2843   %}
2844   ins_pipe( pipe_slow );
2845 %}
2846 
2847 // Floats vector add
2848 instruct vadd2F(vecD dst, vecD src) %{
2849   predicate(n->as_Vector()->length() == 2);
2850   match(Set dst (AddVF dst src));
2851   format %{ "addps   $dst,$src\t! add packed2F" %}
2852   ins_encode %{
2853     __ addps($dst$$XMMRegister, $src$$XMMRegister);
2854   %}
2855   ins_pipe( pipe_slow );
2856 %}
2857 
2858 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
2859   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
2860   match(Set dst (AddVF src1 src2));
2861   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
2862   ins_encode %{
2863     bool vector256 = false;
2864     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2865   %}
2866   ins_pipe( pipe_slow );
2867 %}
2868 
2869 instruct vadd4F(vecX dst, vecX src) %{
2870   predicate(n->as_Vector()->length() == 4);
2871   match(Set dst (AddVF dst src));
2872   format %{ "addps   $dst,$src\t! add packed4F" %}
2873   ins_encode %{
2874     __ addps($dst$$XMMRegister, $src$$XMMRegister);
2875   %}
2876   ins_pipe( pipe_slow );
2877 %}
2878 
2879 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
2880   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2881   match(Set dst (AddVF src1 src2));
2882   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
2883   ins_encode %{
2884     bool vector256 = false;
2885     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2886   %}
2887   ins_pipe( pipe_slow );
2888 %}
2889 
2890 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
2891   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2892   match(Set dst (AddVF src (LoadVector mem)));
2893   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
2894   ins_encode %{
2895     bool vector256 = false;
2896     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2897   %}
2898   ins_pipe( pipe_slow );
2899 %}
2900 
2901 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
2902   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
2903   match(Set dst (AddVF src1 src2));
2904   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
2905   ins_encode %{
2906     bool vector256 = true;
2907     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2908   %}
2909   ins_pipe( pipe_slow );
2910 %}
2911 
2912 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
2913   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
2914   match(Set dst (AddVF src (LoadVector mem)));
2915   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
2916   ins_encode %{
2917     bool vector256 = true;
2918     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2919   %}
2920   ins_pipe( pipe_slow );
2921 %}
2922 
2923 // Doubles vector add
2924 instruct vadd2D(vecX dst, vecX src) %{
2925   predicate(n->as_Vector()->length() == 2);
2926   match(Set dst (AddVD dst src));
2927   format %{ "addpd   $dst,$src\t! add packed2D" %}
2928   ins_encode %{
2929     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
2930   %}
2931   ins_pipe( pipe_slow );
2932 %}
2933 
2934 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
2935   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
2936   match(Set dst (AddVD src1 src2));
2937   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
2938   ins_encode %{
2939     bool vector256 = false;
2940     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2941   %}
2942   ins_pipe( pipe_slow );
2943 %}
2944 
2945 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
2946   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
2947   match(Set dst (AddVD src (LoadVector mem)));
2948   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
2949   ins_encode %{
2950     bool vector256 = false;
2951     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2952   %}
2953   ins_pipe( pipe_slow );
2954 %}
2955 
2956 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
2957   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2958   match(Set dst (AddVD src1 src2));
2959   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
2960   ins_encode %{
2961     bool vector256 = true;
2962     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2963   %}
2964   ins_pipe( pipe_slow );
2965 %}
2966 
2967 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
2968   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2969   match(Set dst (AddVD src (LoadVector mem)));
2970   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
2971   ins_encode %{
2972     bool vector256 = true;
2973     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
2974   %}
2975   ins_pipe( pipe_slow );
2976 %}
2977 
2978 // --------------------------------- SUB --------------------------------------
2979 
2980 // Bytes vector sub
2981 instruct vsub4B(vecS dst, vecS src) %{
2982   predicate(n->as_Vector()->length() == 4);
2983   match(Set dst (SubVB dst src));
2984   format %{ "psubb   $dst,$src\t! sub packed4B" %}
2985   ins_encode %{
2986     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
2987   %}
2988   ins_pipe( pipe_slow );
2989 %}
2990 
2991 instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
2992   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
2993   match(Set dst (SubVB src1 src2));
2994   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
2995   ins_encode %{
2996     bool vector256 = false;
2997     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
2998   %}
2999   ins_pipe( pipe_slow );
3000 %}
3001 
3002 instruct vsub8B(vecD dst, vecD src) %{
3003   predicate(n->as_Vector()->length() == 8);
3004   match(Set dst (SubVB dst src));
3005   format %{ "psubb   $dst,$src\t! sub packed8B" %}
3006   ins_encode %{
3007     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
3008   %}
3009   ins_pipe( pipe_slow );
3010 %}
3011 
3012 instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
3013   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3014   match(Set dst (SubVB src1 src2));
3015   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
3016   ins_encode %{
3017     bool vector256 = false;
3018     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3019   %}
3020   ins_pipe( pipe_slow );
3021 %}
3022 
3023 instruct vsub16B(vecX dst, vecX src) %{
3024   predicate(n->as_Vector()->length() == 16);
3025   match(Set dst (SubVB dst src));
3026   format %{ "psubb   $dst,$src\t! sub packed16B" %}
3027   ins_encode %{
3028     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
3029   %}
3030   ins_pipe( pipe_slow );
3031 %}
3032 
3033 instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
3034   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
3035   match(Set dst (SubVB src1 src2));
3036   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
3037   ins_encode %{
3038     bool vector256 = false;
3039     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3040   %}
3041   ins_pipe( pipe_slow );
3042 %}
3043 
3044 instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{
3045   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
3046   match(Set dst (SubVB src (LoadVector mem)));
3047   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
3048   ins_encode %{
3049     bool vector256 = false;
3050     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3051   %}
3052   ins_pipe( pipe_slow );
3053 %}
3054 
3055 instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{
3056   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
3057   match(Set dst (SubVB src1 src2));
3058   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
3059   ins_encode %{
3060     bool vector256 = true;
3061     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3062   %}
3063   ins_pipe( pipe_slow );
3064 %}
3065 
3066 instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{
3067   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
3068   match(Set dst (SubVB src (LoadVector mem)));
3069   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
3070   ins_encode %{
3071     bool vector256 = true;
3072     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3073   %}
3074   ins_pipe( pipe_slow );
3075 %}
3076 
3077 // Shorts/Chars vector sub
3078 instruct vsub2S(vecS dst, vecS src) %{
3079   predicate(n->as_Vector()->length() == 2);
3080   match(Set dst (SubVS dst src));
3081   format %{ "psubw   $dst,$src\t! sub packed2S" %}
3082   ins_encode %{
3083     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
3084   %}
3085   ins_pipe( pipe_slow );
3086 %}
3087 
3088 instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
3089   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3090   match(Set dst (SubVS src1 src2));
3091   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
3092   ins_encode %{
3093     bool vector256 = false;
3094     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3095   %}
3096   ins_pipe( pipe_slow );
3097 %}
3098 
3099 instruct vsub4S(vecD dst, vecD src) %{
3100   predicate(n->as_Vector()->length() == 4);
3101   match(Set dst (SubVS dst src));
3102   format %{ "psubw   $dst,$src\t! sub packed4S" %}
3103   ins_encode %{
3104     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
3105   %}
3106   ins_pipe( pipe_slow );
3107 %}
3108 
3109 instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
3110   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3111   match(Set dst (SubVS src1 src2));
3112   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
3113   ins_encode %{
3114     bool vector256 = false;
3115     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3116   %}
3117   ins_pipe( pipe_slow );
3118 %}
3119 
3120 instruct vsub8S(vecX dst, vecX src) %{
3121   predicate(n->as_Vector()->length() == 8);
3122   match(Set dst (SubVS dst src));
3123   format %{ "psubw   $dst,$src\t! sub packed8S" %}
3124   ins_encode %{
3125     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
3126   %}
3127   ins_pipe( pipe_slow );
3128 %}
3129 
3130 instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
3131   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3132   match(Set dst (SubVS src1 src2));
3133   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
3134   ins_encode %{
3135     bool vector256 = false;
3136     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3137   %}
3138   ins_pipe( pipe_slow );
3139 %}
3140 
3141 instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{
3142   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3143   match(Set dst (SubVS src (LoadVector mem)));
3144   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
3145   ins_encode %{
3146     bool vector256 = false;
3147     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3148   %}
3149   ins_pipe( pipe_slow );
3150 %}
3151 
3152 instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{
3153   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
3154   match(Set dst (SubVS src1 src2));
3155   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
3156   ins_encode %{
3157     bool vector256 = true;
3158     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3159   %}
3160   ins_pipe( pipe_slow );
3161 %}
3162 
3163 instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{
3164   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
3165   match(Set dst (SubVS src (LoadVector mem)));
3166   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
3167   ins_encode %{
3168     bool vector256 = true;
3169     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3170   %}
3171   ins_pipe( pipe_slow );
3172 %}
3173 
3174 // Integers vector sub
3175 instruct vsub2I(vecD dst, vecD src) %{
3176   predicate(n->as_Vector()->length() == 2);
3177   match(Set dst (SubVI dst src));
3178   format %{ "psubd   $dst,$src\t! sub packed2I" %}
3179   ins_encode %{
3180     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
3181   %}
3182   ins_pipe( pipe_slow );
3183 %}
3184 
3185 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
3186   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3187   match(Set dst (SubVI src1 src2));
3188   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
3189   ins_encode %{
3190     bool vector256 = false;
3191     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3192   %}
3193   ins_pipe( pipe_slow );
3194 %}
3195 
3196 instruct vsub4I(vecX dst, vecX src) %{
3197   predicate(n->as_Vector()->length() == 4);
3198   match(Set dst (SubVI dst src));
3199   format %{ "psubd   $dst,$src\t! sub packed4I" %}
3200   ins_encode %{
3201     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
3202   %}
3203   ins_pipe( pipe_slow );
3204 %}
3205 
3206 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
3207   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3208   match(Set dst (SubVI src1 src2));
3209   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
3210   ins_encode %{
3211     bool vector256 = false;
3212     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3213   %}
3214   ins_pipe( pipe_slow );
3215 %}
3216 
3217 instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
3218   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3219   match(Set dst (SubVI src (LoadVector mem)));
3220   format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
3221   ins_encode %{
3222     bool vector256 = false;
3223     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3224   %}
3225   ins_pipe( pipe_slow );
3226 %}
3227 
3228 instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
3229   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
3230   match(Set dst (SubVI src1 src2));
3231   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
3232   ins_encode %{
3233     bool vector256 = true;
3234     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3235   %}
3236   ins_pipe( pipe_slow );
3237 %}
3238 
3239 instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
3240   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
3241   match(Set dst (SubVI src (LoadVector mem)));
3242   format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
3243   ins_encode %{
3244     bool vector256 = true;
3245     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3246   %}
3247   ins_pipe( pipe_slow );
3248 %}
3249 
3250 // Longs vector sub
3251 instruct vsub2L(vecX dst, vecX src) %{
3252   predicate(n->as_Vector()->length() == 2);
3253   match(Set dst (SubVL dst src));
3254   format %{ "psubq   $dst,$src\t! sub packed2L" %}
3255   ins_encode %{
3256     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
3257   %}
3258   ins_pipe( pipe_slow );
3259 %}
3260 
3261 instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
3262   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3263   match(Set dst (SubVL src1 src2));
3264   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
3265   ins_encode %{
3266     bool vector256 = false;
3267     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3268   %}
3269   ins_pipe( pipe_slow );
3270 %}
3271 
3272 instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
3273   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3274   match(Set dst (SubVL src (LoadVector mem)));
3275   format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
3276   ins_encode %{
3277     bool vector256 = false;
3278     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3279   %}
3280   ins_pipe( pipe_slow );
3281 %}
3282 
3283 instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
3284   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
3285   match(Set dst (SubVL src1 src2));
3286   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
3287   ins_encode %{
3288     bool vector256 = true;
3289     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3290   %}
3291   ins_pipe( pipe_slow );
3292 %}
3293 
3294 instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
3295   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
3296   match(Set dst (SubVL src (LoadVector mem)));
3297   format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
3298   ins_encode %{
3299     bool vector256 = true;
3300     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3301   %}
3302   ins_pipe( pipe_slow );
3303 %}
3304 
3305 // Floats vector sub
3306 instruct vsub2F(vecD dst, vecD src) %{
3307   predicate(n->as_Vector()->length() == 2);
3308   match(Set dst (SubVF dst src));
3309   format %{ "subps   $dst,$src\t! sub packed2F" %}
3310   ins_encode %{
3311     __ subps($dst$$XMMRegister, $src$$XMMRegister);
3312   %}
3313   ins_pipe( pipe_slow );
3314 %}
3315 
3316 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
3317   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3318   match(Set dst (SubVF src1 src2));
3319   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
3320   ins_encode %{
3321     bool vector256 = false;
3322     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3323   %}
3324   ins_pipe( pipe_slow );
3325 %}
3326 
3327 instruct vsub4F(vecX dst, vecX src) %{
3328   predicate(n->as_Vector()->length() == 4);
3329   match(Set dst (SubVF dst src));
3330   format %{ "subps   $dst,$src\t! sub packed4F" %}
3331   ins_encode %{
3332     __ subps($dst$$XMMRegister, $src$$XMMRegister);
3333   %}
3334   ins_pipe( pipe_slow );
3335 %}
3336 
3337 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
3338   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3339   match(Set dst (SubVF src1 src2));
3340   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
3341   ins_encode %{
3342     bool vector256 = false;
3343     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3344   %}
3345   ins_pipe( pipe_slow );
3346 %}
3347 
3348 instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
3349   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3350   match(Set dst (SubVF src (LoadVector mem)));
3351   format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
3352   ins_encode %{
3353     bool vector256 = false;
3354     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3355   %}
3356   ins_pipe( pipe_slow );
3357 %}
3358 
3359 instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
3360   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3361   match(Set dst (SubVF src1 src2));
3362   format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
3363   ins_encode %{
3364     bool vector256 = true;
3365     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3366   %}
3367   ins_pipe( pipe_slow );
3368 %}
3369 
3370 instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
3371   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3372   match(Set dst (SubVF src (LoadVector mem)));
3373   format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
3374   ins_encode %{
3375     bool vector256 = true;
3376     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3377   %}
3378   ins_pipe( pipe_slow );
3379 %}
3380 
3381 // Doubles vector sub
3382 instruct vsub2D(vecX dst, vecX src) %{
3383   predicate(n->as_Vector()->length() == 2);
3384   match(Set dst (SubVD dst src));
3385   format %{ "subpd   $dst,$src\t! sub packed2D" %}
3386   ins_encode %{
3387     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
3388   %}
3389   ins_pipe( pipe_slow );
3390 %}
3391 
3392 instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
3393   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3394   match(Set dst (SubVD src1 src2));
3395   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
3396   ins_encode %{
3397     bool vector256 = false;
3398     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3399   %}
3400   ins_pipe( pipe_slow );
3401 %}
3402 
3403 instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
3404   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3405   match(Set dst (SubVD src (LoadVector mem)));
3406   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
3407   ins_encode %{
3408     bool vector256 = false;
3409     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3410   %}
3411   ins_pipe( pipe_slow );
3412 %}
3413 
3414 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
3415   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3416   match(Set dst (SubVD src1 src2));
3417   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
3418   ins_encode %{
3419     bool vector256 = true;
3420     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3421   %}
3422   ins_pipe( pipe_slow );
3423 %}
3424 
3425 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
3426   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3427   match(Set dst (SubVD src (LoadVector mem)));
3428   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
3429   ins_encode %{
3430     bool vector256 = true;
3431     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3432   %}
3433   ins_pipe( pipe_slow );
3434 %}
3435 
3436 // --------------------------------- MUL --------------------------------------
3437 
3438 // Shorts/Chars vector mul
3439 instruct vmul2S(vecS dst, vecS src) %{
3440   predicate(n->as_Vector()->length() == 2);
3441   match(Set dst (MulVS dst src));
3442   format %{ "pmullw  $dst,$src\t! mul packed2S" %}
3443   ins_encode %{
3444     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
3445   %}
3446   ins_pipe( pipe_slow );
3447 %}
3448 
3449 instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
3450   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3451   match(Set dst (MulVS src1 src2));
3452   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
3453   ins_encode %{
3454     bool vector256 = false;
3455     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3456   %}
3457   ins_pipe( pipe_slow );
3458 %}
3459 
3460 instruct vmul4S(vecD dst, vecD src) %{
3461   predicate(n->as_Vector()->length() == 4);
3462   match(Set dst (MulVS dst src));
3463   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
3464   ins_encode %{
3465     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
3466   %}
3467   ins_pipe( pipe_slow );
3468 %}
3469 
3470 instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
3471   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3472   match(Set dst (MulVS src1 src2));
3473   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
3474   ins_encode %{
3475     bool vector256 = false;
3476     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3477   %}
3478   ins_pipe( pipe_slow );
3479 %}
3480 
3481 instruct vmul8S(vecX dst, vecX src) %{
3482   predicate(n->as_Vector()->length() == 8);
3483   match(Set dst (MulVS dst src));
3484   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
3485   ins_encode %{
3486     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
3487   %}
3488   ins_pipe( pipe_slow );
3489 %}
3490 
3491 instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
3492   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3493   match(Set dst (MulVS src1 src2));
3494   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
3495   ins_encode %{
3496     bool vector256 = false;
3497     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3498   %}
3499   ins_pipe( pipe_slow );
3500 %}
3501 
3502 instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{
3503   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3504   match(Set dst (MulVS src (LoadVector mem)));
3505   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
3506   ins_encode %{
3507     bool vector256 = false;
3508     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3509   %}
3510   ins_pipe( pipe_slow );
3511 %}
3512 
3513 instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{
3514   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
3515   match(Set dst (MulVS src1 src2));
3516   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
3517   ins_encode %{
3518     bool vector256 = true;
3519     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3520   %}
3521   ins_pipe( pipe_slow );
3522 %}
3523 
3524 instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{
3525   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
3526   match(Set dst (MulVS src (LoadVector mem)));
3527   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
3528   ins_encode %{
3529     bool vector256 = true;
3530     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3531   %}
3532   ins_pipe( pipe_slow );
3533 %}
3534 
3535 // Integers vector mul (pmulld requires SSE4.1)
3536 instruct vmul2I(vecD dst, vecD src) %{
3537   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
3538   match(Set dst (MulVI dst src));
3539   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
3540   ins_encode %{
3541     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
3542   %}
3543   ins_pipe( pipe_slow );
3544 %}
3545 
3546 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
3547   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3548   match(Set dst (MulVI src1 src2));
3549   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
3550   ins_encode %{
3551     bool vector256 = false;
3552     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3553   %}
3554   ins_pipe( pipe_slow );
3555 %}
3556 
3557 instruct vmul4I(vecX dst, vecX src) %{
3558   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
3559   match(Set dst (MulVI dst src));
3560   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
3561   ins_encode %{
3562     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
3563   %}
3564   ins_pipe( pipe_slow );
3565 %}
3566 
3567 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
3568   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3569   match(Set dst (MulVI src1 src2));
3570   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
3571   ins_encode %{
3572     bool vector256 = false;
3573     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3574   %}
3575   ins_pipe( pipe_slow );
3576 %}
3577 
3578 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
3579   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3580   match(Set dst (MulVI src (LoadVector mem)));
3581   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
3582   ins_encode %{
3583     bool vector256 = false;
3584     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3585   %}
3586   ins_pipe( pipe_slow );
3587 %}
3588 
3589 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
3590   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
3591   match(Set dst (MulVI src1 src2));
3592   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
3593   ins_encode %{
3594     bool vector256 = true;
3595     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3596   %}
3597   ins_pipe( pipe_slow );
3598 %}
3599 
3600 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
3601   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
3602   match(Set dst (MulVI src (LoadVector mem)));
3603   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
3604   ins_encode %{
3605     bool vector256 = true;
3606     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3607   %}
3608   ins_pipe( pipe_slow );
3609 %}
3610 
3611 // Floats vector mul
3612 instruct vmul2F(vecD dst, vecD src) %{
3613   predicate(n->as_Vector()->length() == 2);
3614   match(Set dst (MulVF dst src));
3615   format %{ "mulps   $dst,$src\t! mul packed2F" %}
3616   ins_encode %{
3617     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
3618   %}
3619   ins_pipe( pipe_slow );
3620 %}
3621 
3622 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
3623   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3624   match(Set dst (MulVF src1 src2));
3625   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
3626   ins_encode %{
3627     bool vector256 = false;
3628     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3629   %}
3630   ins_pipe( pipe_slow );
3631 %}
3632 
3633 instruct vmul4F(vecX dst, vecX src) %{
3634   predicate(n->as_Vector()->length() == 4);
3635   match(Set dst (MulVF dst src));
3636   format %{ "mulps   $dst,$src\t! mul packed4F" %}
3637   ins_encode %{
3638     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
3639   %}
3640   ins_pipe( pipe_slow );
3641 %}
3642 
3643 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
3644   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3645   match(Set dst (MulVF src1 src2));
3646   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
3647   ins_encode %{
3648     bool vector256 = false;
3649     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3650   %}
3651   ins_pipe( pipe_slow );
3652 %}
3653 
3654 instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
3655   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3656   match(Set dst (MulVF src (LoadVector mem)));
3657   format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
3658   ins_encode %{
3659     bool vector256 = false;
3660     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3661   %}
3662   ins_pipe( pipe_slow );
3663 %}
3664 
3665 instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
3666   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3667   match(Set dst (MulVF src1 src2));
3668   format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
3669   ins_encode %{
3670     bool vector256 = true;
3671     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3672   %}
3673   ins_pipe( pipe_slow );
3674 %}
3675 
3676 instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
3677   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3678   match(Set dst (MulVF src (LoadVector mem)));
3679   format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
3680   ins_encode %{
3681     bool vector256 = true;
3682     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3683   %}
3684   ins_pipe( pipe_slow );
3685 %}
3686 
3687 // Doubles vector mul
3688 instruct vmul2D(vecX dst, vecX src) %{
3689   predicate(n->as_Vector()->length() == 2);
3690   match(Set dst (MulVD dst src));
3691   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
3692   ins_encode %{
3693     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
3694   %}
3695   ins_pipe( pipe_slow );
3696 %}
3697 
3698 instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
3699   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3700   match(Set dst (MulVD src1 src2));
3701   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
3702   ins_encode %{
3703     bool vector256 = false;
3704     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3705   %}
3706   ins_pipe( pipe_slow );
3707 %}
3708 
3709 instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
3710   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3711   match(Set dst (MulVD src (LoadVector mem)));
3712   format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
3713   ins_encode %{
3714     bool vector256 = false;
3715     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3716   %}
3717   ins_pipe( pipe_slow );
3718 %}
3719 
3720 instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
3721   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3722   match(Set dst (MulVD src1 src2));
3723   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
3724   ins_encode %{
3725     bool vector256 = true;
3726     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3727   %}
3728   ins_pipe( pipe_slow );
3729 %}
3730 
3731 instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
3732   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3733   match(Set dst (MulVD src (LoadVector mem)));
3734   format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
3735   ins_encode %{
3736     bool vector256 = true;
3737     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3738   %}
3739   ins_pipe( pipe_slow );
3740 %}
3741 
3742 // --------------------------------- DIV --------------------------------------
3743 
3744 // Floats vector div
3745 instruct vdiv2F(vecD dst, vecD src) %{
3746   predicate(n->as_Vector()->length() == 2);
3747   match(Set dst (DivVF dst src));
3748   format %{ "divps   $dst,$src\t! div packed2F" %}
3749   ins_encode %{
3750     __ divps($dst$$XMMRegister, $src$$XMMRegister);
3751   %}
3752   ins_pipe( pipe_slow );
3753 %}
3754 
3755 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
3756   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3757   match(Set dst (DivVF src1 src2));
3758   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
3759   ins_encode %{
3760     bool vector256 = false;
3761     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3762   %}
3763   ins_pipe( pipe_slow );
3764 %}
3765 
3766 instruct vdiv4F(vecX dst, vecX src) %{
3767   predicate(n->as_Vector()->length() == 4);
3768   match(Set dst (DivVF dst src));
3769   format %{ "divps   $dst,$src\t! div packed4F" %}
3770   ins_encode %{
3771     __ divps($dst$$XMMRegister, $src$$XMMRegister);
3772   %}
3773   ins_pipe( pipe_slow );
3774 %}
3775 
3776 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
3777   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3778   match(Set dst (DivVF src1 src2));
3779   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
3780   ins_encode %{
3781     bool vector256 = false;
3782     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3783   %}
3784   ins_pipe( pipe_slow );
3785 %}
3786 
3787 instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
3788   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3789   match(Set dst (DivVF src (LoadVector mem)));
3790   format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
3791   ins_encode %{
3792     bool vector256 = false;
3793     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3794   %}
3795   ins_pipe( pipe_slow );
3796 %}
3797 
3798 instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
3799   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3800   match(Set dst (DivVF src1 src2));
3801   format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
3802   ins_encode %{
3803     bool vector256 = true;
3804     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3805   %}
3806   ins_pipe( pipe_slow );
3807 %}
3808 
3809 instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
3810   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3811   match(Set dst (DivVF src (LoadVector mem)));
3812   format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
3813   ins_encode %{
3814     bool vector256 = true;
3815     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3816   %}
3817   ins_pipe( pipe_slow );
3818 %}
3819 
3820 // Doubles vector div
3821 instruct vdiv2D(vecX dst, vecX src) %{
3822   predicate(n->as_Vector()->length() == 2);
3823   match(Set dst (DivVD dst src));
3824   format %{ "divpd   $dst,$src\t! div packed2D" %}
3825   ins_encode %{
3826     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
3827   %}
3828   ins_pipe( pipe_slow );
3829 %}
3830 
3831 instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
3832   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3833   match(Set dst (DivVD src1 src2));
3834   format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
3835   ins_encode %{
3836     bool vector256 = false;
3837     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3838   %}
3839   ins_pipe( pipe_slow );
3840 %}
3841 
3842 instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
3843   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3844   match(Set dst (DivVD src (LoadVector mem)));
3845   format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
3846   ins_encode %{
3847     bool vector256 = false;
3848     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3849   %}
3850   ins_pipe( pipe_slow );
3851 %}
3852 
3853 instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
3854   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3855   match(Set dst (DivVD src1 src2));
3856   format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
3857   ins_encode %{
3858     bool vector256 = true;
3859     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
3860   %}
3861   ins_pipe( pipe_slow );
3862 %}
3863 
3864 instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
3865   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3866   match(Set dst (DivVD src (LoadVector mem)));
3867   format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
3868   ins_encode %{
3869     bool vector256 = true;
3870     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
3871   %}
3872   ins_pipe( pipe_slow );
3873 %}
3874 
3875 // ------------------------------ Shift ---------------------------------------
3876 
3877 // Left and right shift count vectors are the same on x86
3878 // (only the lowest bits of the xmm reg are used for the count).
3879 instruct vshiftcnt(vecS dst, rRegI cnt) %{
3880   match(Set dst (LShiftCntV cnt));
3881   match(Set dst (RShiftCntV cnt));
3882   format %{ "movd    $dst,$cnt\t! load shift count" %}
3883   ins_encode %{
3884     __ movdl($dst$$XMMRegister, $cnt$$Register);
3885   %}
3886   ins_pipe( pipe_slow );
3887 %}
3888 
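// As a rough illustration of why one count register serves both shift
// directions, a scalar C++ sketch of the SSE2 pslld/psrld behaviour with an
// XMM count operand (an assumption summarized from the instruction set
// definition, not code from this file; the helper names are hypothetical):
//
//   uint32_t pslld_lane(uint32_t lane, uint64_t cnt) { return cnt > 31 ? 0 : lane << cnt; }
//   uint32_t psrld_lane(uint32_t lane, uint64_t cnt) { return cnt > 31 ? 0 : lane >> cnt; }
//
// Both helpers consume the same count value loaded by movdl above, which is
// why LShiftCntV and RShiftCntV can match the single vshiftcnt instruct.
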
3889 // ------------------------------ LeftShift -----------------------------------
3890 
3891 // Shorts/Chars vector left shift
3892 instruct vsll2S(vecS dst, vecS shift) %{
3893   predicate(n->as_Vector()->length() == 2);
3894   match(Set dst (LShiftVS dst shift));
3895   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
3896   ins_encode %{
3897     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
3898   %}
3899   ins_pipe( pipe_slow );
3900 %}
3901 
3902 instruct vsll2S_imm(vecS dst, immI8 shift) %{
3903   predicate(n->as_Vector()->length() == 2);
3904   match(Set dst (LShiftVS dst shift));
3905   format %{ "psllw   $dst,$shift\t! left shift packed2S" %}
3906   ins_encode %{
3907     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
3908   %}
3909   ins_pipe( pipe_slow );
3910 %}
3911 
3912 instruct vsll2S_reg(vecS dst, vecS src, vecS shift) %{
3913   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3914   match(Set dst (LShiftVS src shift));
3915   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
3916   ins_encode %{
3917     bool vector256 = false;
3918     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
3919   %}
3920   ins_pipe( pipe_slow );
3921 %}
3922 
3923 instruct vsll2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
3924   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
3925   match(Set dst (LShiftVS src shift));
3926   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed2S" %}
3927   ins_encode %{
3928     bool vector256 = false;
3929     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
3930   %}
3931   ins_pipe( pipe_slow );
3932 %}
3933 
3934 instruct vsll4S(vecD dst, vecS shift) %{
3935   predicate(n->as_Vector()->length() == 4);
3936   match(Set dst (LShiftVS dst shift));
3937   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
3938   ins_encode %{
3939     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
3940   %}
3941   ins_pipe( pipe_slow );
3942 %}
3943 
3944 instruct vsll4S_imm(vecD dst, immI8 shift) %{
3945   predicate(n->as_Vector()->length() == 4);
3946   match(Set dst (LShiftVS dst shift));
3947   format %{ "psllw   $dst,$shift\t! left shift packed4S" %}
3948   ins_encode %{
3949     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
3950   %}
3951   ins_pipe( pipe_slow );
3952 %}
3953 
3954 instruct vsll4S_reg(vecD dst, vecD src, vecS shift) %{
3955   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3956   match(Set dst (LShiftVS src shift));
3957   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
3958   ins_encode %{
3959     bool vector256 = false;
3960     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
3961   %}
3962   ins_pipe( pipe_slow );
3963 %}
3964 
3965 instruct vsll4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
3966   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
3967   match(Set dst (LShiftVS src shift));
3968   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed4S" %}
3969   ins_encode %{
3970     bool vector256 = false;
3971     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
3972   %}
3973   ins_pipe( pipe_slow );
3974 %}
3975 
3976 instruct vsll8S(vecX dst, vecS shift) %{
3977   predicate(n->as_Vector()->length() == 8);
3978   match(Set dst (LShiftVS dst shift));
3979   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
3980   ins_encode %{
3981     __ psllw($dst$$XMMRegister, $shift$$XMMRegister);
3982   %}
3983   ins_pipe( pipe_slow );
3984 %}
3985 
3986 instruct vsll8S_imm(vecX dst, immI8 shift) %{
3987   predicate(n->as_Vector()->length() == 8);
3988   match(Set dst (LShiftVS dst shift));
3989   format %{ "psllw   $dst,$shift\t! left shift packed8S" %}
3990   ins_encode %{
3991     __ psllw($dst$$XMMRegister, (int)$shift$$constant);
3992   %}
3993   ins_pipe( pipe_slow );
3994 %}
3995 
3996 instruct vsll8S_reg(vecX dst, vecX src, vecS shift) %{
3997   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
3998   match(Set dst (LShiftVS src shift));
3999   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
4000   ins_encode %{
4001     bool vector256 = false;
4002     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4003   %}
4004   ins_pipe( pipe_slow );
4005 %}
4006 
4007 instruct vsll8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
4008   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4009   match(Set dst (LShiftVS src shift));
4010   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed8S" %}
4011   ins_encode %{
4012     bool vector256 = false;
4013     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4014   %}
4015   ins_pipe( pipe_slow );
4016 %}
4017 
4018 instruct vsll16S_reg(vecY dst, vecY src, vecS shift) %{
4019   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4020   match(Set dst (LShiftVS src shift));
4021   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
4022   ins_encode %{
4023     bool vector256 = true;
4024     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4025   %}
4026   ins_pipe( pipe_slow );
4027 %}
4028 
4029 instruct vsll16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
4030   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4031   match(Set dst (LShiftVS src shift));
4032   format %{ "vpsllw  $dst,$src,$shift\t! left shift packed16S" %}
4033   ins_encode %{
4034     bool vector256 = true;
4035     __ vpsllw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4036   %}
4037   ins_pipe( pipe_slow );
4038 %}
4039 
4040 // Integers vector left shift
4041 instruct vsll2I(vecD dst, vecS shift) %{
4042   predicate(n->as_Vector()->length() == 2);
4043   match(Set dst (LShiftVI dst shift));
4044   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
4045   ins_encode %{
4046     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
4047   %}
4048   ins_pipe( pipe_slow );
4049 %}
4050 
4051 instruct vsll2I_imm(vecD dst, immI8 shift) %{
4052   predicate(n->as_Vector()->length() == 2);
4053   match(Set dst (LShiftVI dst shift));
4054   format %{ "pslld   $dst,$shift\t! left shift packed2I" %}
4055   ins_encode %{
4056     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
4057   %}
4058   ins_pipe( pipe_slow );
4059 %}
4060 
4061 instruct vsll2I_reg(vecD dst, vecD src, vecS shift) %{
4062   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4063   match(Set dst (LShiftVI src shift));
4064   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
4065   ins_encode %{
4066     bool vector256 = false;
4067     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4068   %}
4069   ins_pipe( pipe_slow );
4070 %}
4071 
4072 instruct vsll2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
4073   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4074   match(Set dst (LShiftVI src shift));
4075   format %{ "vpslld  $dst,$src,$shift\t! left shift packed2I" %}
4076   ins_encode %{
4077     bool vector256 = false;
4078     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4079   %}
4080   ins_pipe( pipe_slow );
4081 %}
4082 
4083 instruct vsll4I(vecX dst, vecS shift) %{
4084   predicate(n->as_Vector()->length() == 4);
4085   match(Set dst (LShiftVI dst shift));
4086   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
4087   ins_encode %{
4088     __ pslld($dst$$XMMRegister, $shift$$XMMRegister);
4089   %}
4090   ins_pipe( pipe_slow );
4091 %}
4092 
4093 instruct vsll4I_imm(vecX dst, immI8 shift) %{
4094   predicate(n->as_Vector()->length() == 4);
4095   match(Set dst (LShiftVI dst shift));
4096   format %{ "pslld   $dst,$shift\t! left shift packed4I" %}
4097   ins_encode %{
4098     __ pslld($dst$$XMMRegister, (int)$shift$$constant);
4099   %}
4100   ins_pipe( pipe_slow );
4101 %}
4102 
4103 instruct vsll4I_reg(vecX dst, vecX src, vecS shift) %{
4104   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4105   match(Set dst (LShiftVI src shift));
4106   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
4107   ins_encode %{
4108     bool vector256 = false;
4109     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4110   %}
4111   ins_pipe( pipe_slow );
4112 %}
4113 
4114 instruct vsll4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
4115   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4116   match(Set dst (LShiftVI src shift));
4117   format %{ "vpslld  $dst,$src,$shift\t! left shift packed4I" %}
4118   ins_encode %{
4119     bool vector256 = false;
4120     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4121   %}
4122   ins_pipe( pipe_slow );
4123 %}
4124 
4125 instruct vsll8I_reg(vecY dst, vecY src, vecS shift) %{
4126   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
4127   match(Set dst (LShiftVI src shift));
4128   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
4129   ins_encode %{
4130     bool vector256 = true;
4131     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4132   %}
4133   ins_pipe( pipe_slow );
4134 %}
4135 
4136 instruct vsll8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
4137   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
4138   match(Set dst (LShiftVI src shift));
4139   format %{ "vpslld  $dst,$src,$shift\t! left shift packed8I" %}
4140   ins_encode %{
4141     bool vector256 = true;
4142     __ vpslld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4143   %}
4144   ins_pipe( pipe_slow );
4145 %}
4146 
4147 // Longs vector left shift
4148 instruct vsll2L(vecX dst, vecS shift) %{
4149   predicate(n->as_Vector()->length() == 2);
4150   match(Set dst (LShiftVL dst shift));
4151   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
4152   ins_encode %{
4153     __ psllq($dst$$XMMRegister, $shift$$XMMRegister);
4154   %}
4155   ins_pipe( pipe_slow );
4156 %}
4157 
4158 instruct vsll2L_imm(vecX dst, immI8 shift) %{
4159   predicate(n->as_Vector()->length() == 2);
4160   match(Set dst (LShiftVL dst shift));
4161   format %{ "psllq   $dst,$shift\t! left shift packed2L" %}
4162   ins_encode %{
4163     __ psllq($dst$$XMMRegister, (int)$shift$$constant);
4164   %}
4165   ins_pipe( pipe_slow );
4166 %}
4167 
4168 instruct vsll2L_reg(vecX dst, vecX src, vecS shift) %{
4169   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4170   match(Set dst (LShiftVL src shift));
4171   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
4172   ins_encode %{
4173     bool vector256 = false;
4174     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4175   %}
4176   ins_pipe( pipe_slow );
4177 %}
4178 
4179 instruct vsll2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
4180   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4181   match(Set dst (LShiftVL src shift));
4182   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed2L" %}
4183   ins_encode %{
4184     bool vector256 = false;
4185     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4186   %}
4187   ins_pipe( pipe_slow );
4188 %}
4189 
4190 instruct vsll4L_reg(vecY dst, vecY src, vecS shift) %{
4191   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
4192   match(Set dst (LShiftVL src shift));
4193   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
4194   ins_encode %{
4195     bool vector256 = true;
4196     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4197   %}
4198   ins_pipe( pipe_slow );
4199 %}
4200 
4201 instruct vsll4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
4202   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
4203   match(Set dst (LShiftVL src shift));
4204   format %{ "vpsllq  $dst,$src,$shift\t! left shift packed4L" %}
4205   ins_encode %{
4206     bool vector256 = true;
4207     __ vpsllq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4208   %}
4209   ins_pipe( pipe_slow );
4210 %}
4211 
4212 // ----------------------- LogicalRightShift -----------------------------------
4213 
4214 // Shorts vector logical right shift produces an incorrect Java result
4215 // for negative data because Java code converts a short value into an int
4216 // with sign extension before the shift. But char vectors are fine since
4217 // chars are unsigned values.
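//
// A rough illustration (scalar C++ sketch, not code from this file; the
// values and variable names are hypothetical):
//
//   int16_t s = -100;                                  // short bit pattern 0xFF9C
//   uint32_t java_scalar = (uint32_t)(int32_t)s >> 2;  // promote with sign extension,
//                                                      // then >>> 2  -> 0x3FFFFFE7
//   uint16_t packed_lane = (uint16_t)s >> 2;           // psrlw on a 16-bit lane -> 0x3FE7
//
// Narrowed back to a short, the Java result is -25 while the packed lane is
// 16359; hence the warning above that only char (unsigned) data vectorizes
// correctly with these patterns.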
4218 
4219 instruct vsrl2S(vecS dst, vecS shift) %{
4220   predicate(n->as_Vector()->length() == 2);
4221   match(Set dst (URShiftVS dst shift));
4222   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
4223   ins_encode %{
4224     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
4225   %}
4226   ins_pipe( pipe_slow );
4227 %}
4228 
4229 instruct vsrl2S_imm(vecS dst, immI8 shift) %{
4230   predicate(n->as_Vector()->length() == 2);
4231   match(Set dst (URShiftVS dst shift));
4232   format %{ "psrlw   $dst,$shift\t! logical right shift packed2S" %}
4233   ins_encode %{
4234     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
4235   %}
4236   ins_pipe( pipe_slow );
4237 %}
4238 
4239 instruct vsrl2S_reg(vecS dst, vecS src, vecS shift) %{
4240   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4241   match(Set dst (URShiftVS src shift));
4242   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
4243   ins_encode %{
4244     bool vector256 = false;
4245     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4246   %}
4247   ins_pipe( pipe_slow );
4248 %}
4249 
4250 instruct vsrl2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
4251   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4252   match(Set dst (URShiftVS src shift));
4253   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed2S" %}
4254   ins_encode %{
4255     bool vector256 = false;
4256     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4257   %}
4258   ins_pipe( pipe_slow );
4259 %}
4260 
4261 instruct vsrl4S(vecD dst, vecS shift) %{
4262   predicate(n->as_Vector()->length() == 4);
4263   match(Set dst (URShiftVS dst shift));
4264   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
4265   ins_encode %{
4266     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
4267   %}
4268   ins_pipe( pipe_slow );
4269 %}
4270 
4271 instruct vsrl4S_imm(vecD dst, immI8 shift) %{
4272   predicate(n->as_Vector()->length() == 4);
4273   match(Set dst (URShiftVS dst shift));
4274   format %{ "psrlw   $dst,$shift\t! logical right shift packed4S" %}
4275   ins_encode %{
4276     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
4277   %}
4278   ins_pipe( pipe_slow );
4279 %}
4280 
4281 instruct vsrl4S_reg(vecD dst, vecD src, vecS shift) %{
4282   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4283   match(Set dst (URShiftVS src shift));
4284   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
4285   ins_encode %{
4286     bool vector256 = false;
4287     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4288   %}
4289   ins_pipe( pipe_slow );
4290 %}
4291 
4292 instruct vsrl4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
4293   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4294   match(Set dst (URShiftVS src shift));
4295   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed4S" %}
4296   ins_encode %{
4297     bool vector256 = false;
4298     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4299   %}
4300   ins_pipe( pipe_slow );
4301 %}
4302 
4303 instruct vsrl8S(vecX dst, vecS shift) %{
4304   predicate(n->as_Vector()->length() == 8);
4305   match(Set dst (URShiftVS dst shift));
4306   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
4307   ins_encode %{
4308     __ psrlw($dst$$XMMRegister, $shift$$XMMRegister);
4309   %}
4310   ins_pipe( pipe_slow );
4311 %}
4312 
4313 instruct vsrl8S_imm(vecX dst, immI8 shift) %{
4314   predicate(n->as_Vector()->length() == 8);
4315   match(Set dst (URShiftVS dst shift));
4316   format %{ "psrlw   $dst,$shift\t! logical right shift packed8S" %}
4317   ins_encode %{
4318     __ psrlw($dst$$XMMRegister, (int)$shift$$constant);
4319   %}
4320   ins_pipe( pipe_slow );
4321 %}
4322 
4323 instruct vsrl8S_reg(vecX dst, vecX src, vecS shift) %{
4324   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4325   match(Set dst (URShiftVS src shift));
4326   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
4327   ins_encode %{
4328     bool vector256 = false;
4329     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4330   %}
4331   ins_pipe( pipe_slow );
4332 %}
4333 
4334 instruct vsrl8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
4335   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4336   match(Set dst (URShiftVS src shift));
4337   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed8S" %}
4338   ins_encode %{
4339     bool vector256 = false;
4340     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4341   %}
4342   ins_pipe( pipe_slow );
4343 %}
4344 
4345 instruct vsrl16S_reg(vecY dst, vecY src, vecS shift) %{
4346   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4347   match(Set dst (URShiftVS src shift));
4348   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
4349   ins_encode %{
4350     bool vector256 = true;
4351     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4352   %}
4353   ins_pipe( pipe_slow );
4354 %}
4355 
4356 instruct vsrl16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
4357   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4358   match(Set dst (URShiftVS src shift));
4359   format %{ "vpsrlw  $dst,$src,$shift\t! logical right shift packed16S" %}
4360   ins_encode %{
4361     bool vector256 = true;
4362     __ vpsrlw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4363   %}
4364   ins_pipe( pipe_slow );
4365 %}
4366 
4367 // Integers vector logical right shift
4368 instruct vsrl2I(vecD dst, vecS shift) %{
4369   predicate(n->as_Vector()->length() == 2);
4370   match(Set dst (URShiftVI dst shift));
4371   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
4372   ins_encode %{
4373     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
4374   %}
4375   ins_pipe( pipe_slow );
4376 %}
4377 
4378 instruct vsrl2I_imm(vecD dst, immI8 shift) %{
4379   predicate(n->as_Vector()->length() == 2);
4380   match(Set dst (URShiftVI dst shift));
4381   format %{ "psrld   $dst,$shift\t! logical right shift packed2I" %}
4382   ins_encode %{
4383     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
4384   %}
4385   ins_pipe( pipe_slow );
4386 %}
4387 
4388 instruct vsrl2I_reg(vecD dst, vecD src, vecS shift) %{
4389   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4390   match(Set dst (URShiftVI src shift));
4391   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
4392   ins_encode %{
4393     bool vector256 = false;
4394     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4395   %}
4396   ins_pipe( pipe_slow );
4397 %}
4398 
4399 instruct vsrl2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
4400   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4401   match(Set dst (URShiftVI src shift));
4402   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed2I" %}
4403   ins_encode %{
4404     bool vector256 = false;
4405     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4406   %}
4407   ins_pipe( pipe_slow );
4408 %}
4409 
4410 instruct vsrl4I(vecX dst, vecS shift) %{
4411   predicate(n->as_Vector()->length() == 4);
4412   match(Set dst (URShiftVI dst shift));
4413   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
4414   ins_encode %{
4415     __ psrld($dst$$XMMRegister, $shift$$XMMRegister);
4416   %}
4417   ins_pipe( pipe_slow );
4418 %}
4419 
4420 instruct vsrl4I_imm(vecX dst, immI8 shift) %{
4421   predicate(n->as_Vector()->length() == 4);
4422   match(Set dst (URShiftVI dst shift));
4423   format %{ "psrld   $dst,$shift\t! logical right shift packed4I" %}
4424   ins_encode %{
4425     __ psrld($dst$$XMMRegister, (int)$shift$$constant);
4426   %}
4427   ins_pipe( pipe_slow );
4428 %}
4429 
4430 instruct vsrl4I_reg(vecX dst, vecX src, vecS shift) %{
4431   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4432   match(Set dst (URShiftVI src shift));
4433   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
4434   ins_encode %{
4435     bool vector256 = false;
4436     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4437   %}
4438   ins_pipe( pipe_slow );
4439 %}
4440 
4441 instruct vsrl4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
4442   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4443   match(Set dst (URShiftVI src shift));
4444   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed4I" %}
4445   ins_encode %{
4446     bool vector256 = false;
4447     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4448   %}
4449   ins_pipe( pipe_slow );
4450 %}
4451 
4452 instruct vsrl8I_reg(vecY dst, vecY src, vecS shift) %{
4453   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
4454   match(Set dst (URShiftVI src shift));
4455   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
4456   ins_encode %{
4457     bool vector256 = true;
4458     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4459   %}
4460   ins_pipe( pipe_slow );
4461 %}
4462 
4463 instruct vsrl8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
4464   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
4465   match(Set dst (URShiftVI src shift));
4466   format %{ "vpsrld  $dst,$src,$shift\t! logical right shift packed8I" %}
4467   ins_encode %{
4468     bool vector256 = true;
4469     __ vpsrld($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4470   %}
4471   ins_pipe( pipe_slow );
4472 %}
4473 
4474 // Longs vector logical right shift
4475 instruct vsrl2L(vecX dst, vecS shift) %{
4476   predicate(n->as_Vector()->length() == 2);
4477   match(Set dst (URShiftVL dst shift));
4478   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
4479   ins_encode %{
4480     __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
4481   %}
4482   ins_pipe( pipe_slow );
4483 %}
4484 
4485 instruct vsrl2L_imm(vecX dst, immI8 shift) %{
4486   predicate(n->as_Vector()->length() == 2);
4487   match(Set dst (URShiftVL dst shift));
4488   format %{ "psrlq   $dst,$shift\t! logical right shift packed2L" %}
4489   ins_encode %{
4490     __ psrlq($dst$$XMMRegister, (int)$shift$$constant);
4491   %}
4492   ins_pipe( pipe_slow );
4493 %}
4494 
4495 instruct vsrl2L_reg(vecX dst, vecX src, vecS shift) %{
4496   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4497   match(Set dst (URShiftVL src shift));
4498   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
4499   ins_encode %{
4500     bool vector256 = false;
4501     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4502   %}
4503   ins_pipe( pipe_slow );
4504 %}
4505 
4506 instruct vsrl2L_reg_imm(vecX dst, vecX src, immI8 shift) %{
4507   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4508   match(Set dst (URShiftVL src shift));
4509   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed2L" %}
4510   ins_encode %{
4511     bool vector256 = false;
4512     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4513   %}
4514   ins_pipe( pipe_slow );
4515 %}
4516 
4517 instruct vsrl4L_reg(vecY dst, vecY src, vecS shift) %{
4518   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
4519   match(Set dst (URShiftVL src shift));
4520   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
4521   ins_encode %{
4522     bool vector256 = true;
4523     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4524   %}
4525   ins_pipe( pipe_slow );
4526 %}
4527 
4528 instruct vsrl4L_reg_imm(vecY dst, vecY src, immI8 shift) %{
4529   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
4530   match(Set dst (URShiftVL src shift));
4531   format %{ "vpsrlq  $dst,$src,$shift\t! logical right shift packed4L" %}
4532   ins_encode %{
4533     bool vector256 = true;
4534     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4535   %}
4536   ins_pipe( pipe_slow );
4537 %}
4538 
4539 // ------------------- ArithmeticRightShift -----------------------------------
4540 
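// Unlike the logical shifts above, the arithmetic forms (psraw/psrad and
// their VEX-encoded equivalents) copy the sign bit into the vacated
// positions.  For a 32-bit lane, for example:
//   logical:    0x80000004 >>> 1 == 0x40000002   (zero fill, psrld)
//   arithmetic: 0x80000004  >> 1 == 0xC0000002   (sign fill, psrad)
// As with the logical shifts, a non-constant count is taken from the low
// quadword of an XMM register (operand class vecS), an immI8 count uses the
// immediate form of the instruction, and the vector256 flag passed to the
// AVX assembler helpers selects a 128-bit (false) or 256-bit (true) operation.
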
4541 // Shorts/Chars vector arithmetic right shift
4542 instruct vsra2S(vecS dst, vecS shift) %{
4543   predicate(n->as_Vector()->length() == 2);
4544   match(Set dst (RShiftVS dst shift));
4545   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
4546   ins_encode %{
4547     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
4548   %}
4549   ins_pipe( pipe_slow );
4550 %}
4551 
4552 instruct vsra2S_imm(vecS dst, immI8 shift) %{
4553   predicate(n->as_Vector()->length() == 2);
4554   match(Set dst (RShiftVS dst shift));
4555   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed2S" %}
4556   ins_encode %{
4557     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
4558   %}
4559   ins_pipe( pipe_slow );
4560 %}
4561 
4562 instruct vsra2S_reg(vecS dst, vecS src, vecS shift) %{
4563   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4564   match(Set dst (RShiftVS src shift));
4565   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
4566   ins_encode %{
4567     bool vector256 = false;
4568     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4569   %}
4570   ins_pipe( pipe_slow );
4571 %}
4572 
4573 instruct vsra2S_reg_imm(vecS dst, vecS src, immI8 shift) %{
4574   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4575   match(Set dst (RShiftVS src shift));
4576   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed2S" %}
4577   ins_encode %{
4578     bool vector256 = false;
4579     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4580   %}
4581   ins_pipe( pipe_slow );
4582 %}
4583 
4584 instruct vsra4S(vecD dst, vecS shift) %{
4585   predicate(n->as_Vector()->length() == 4);
4586   match(Set dst (RShiftVS dst shift));
4587   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
4588   ins_encode %{
4589     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
4590   %}
4591   ins_pipe( pipe_slow );
4592 %}
4593 
4594 instruct vsra4S_imm(vecD dst, immI8 shift) %{
4595   predicate(n->as_Vector()->length() == 4);
4596   match(Set dst (RShiftVS dst shift));
4597   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed4S" %}
4598   ins_encode %{
4599     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
4600   %}
4601   ins_pipe( pipe_slow );
4602 %}
4603 
4604 instruct vsra4S_reg(vecD dst, vecD src, vecS shift) %{
4605   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4606   match(Set dst (RShiftVS src shift));
4607   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
4608   ins_encode %{
4609     bool vector256 = false;
4610     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4611   %}
4612   ins_pipe( pipe_slow );
4613 %}
4614 
4615 instruct vsra4S_reg_imm(vecD dst, vecD src, immI8 shift) %{
4616   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4617   match(Set dst (RShiftVS src shift));
4618   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed4S" %}
4619   ins_encode %{
4620     bool vector256 = false;
4621     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4622   %}
4623   ins_pipe( pipe_slow );
4624 %}
4625 
4626 instruct vsra8S(vecX dst, vecS shift) %{
4627   predicate(n->as_Vector()->length() == 8);
4628   match(Set dst (RShiftVS dst shift));
4629   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
4630   ins_encode %{
4631     __ psraw($dst$$XMMRegister, $shift$$XMMRegister);
4632   %}
4633   ins_pipe( pipe_slow );
4634 %}
4635 
4636 instruct vsra8S_imm(vecX dst, immI8 shift) %{
4637   predicate(n->as_Vector()->length() == 8);
4638   match(Set dst (RShiftVS dst shift));
4639   format %{ "psraw   $dst,$shift\t! arithmetic right shift packed8S" %}
4640   ins_encode %{
4641     __ psraw($dst$$XMMRegister, (int)$shift$$constant);
4642   %}
4643   ins_pipe( pipe_slow );
4644 %}
4645 
4646 instruct vsra8S_reg(vecX dst, vecX src, vecS shift) %{
4647   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4648   match(Set dst (RShiftVS src shift));
4649   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
4650   ins_encode %{
4651     bool vector256 = false;
4652     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4653   %}
4654   ins_pipe( pipe_slow );
4655 %}
4656 
4657 instruct vsra8S_reg_imm(vecX dst, vecX src, immI8 shift) %{
4658   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
4659   match(Set dst (RShiftVS src shift));
4660   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed8S" %}
4661   ins_encode %{
4662     bool vector256 = false;
4663     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4664   %}
4665   ins_pipe( pipe_slow );
4666 %}
4667 
4668 instruct vsra16S_reg(vecY dst, vecY src, vecS shift) %{
4669   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4670   match(Set dst (RShiftVS src shift));
4671   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
4672   ins_encode %{
4673     bool vector256 = true;
4674     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4675   %}
4676   ins_pipe( pipe_slow );
4677 %}
4678 
4679 instruct vsra16S_reg_imm(vecY dst, vecY src, immI8 shift) %{
4680   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
4681   match(Set dst (RShiftVS src shift));
4682   format %{ "vpsraw  $dst,$src,$shift\t! arithmetic right shift packed16S" %}
4683   ins_encode %{
4684     bool vector256 = true;
4685     __ vpsraw($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4686   %}
4687   ins_pipe( pipe_slow );
4688 %}
4689 
4690 // Integers vector arithmetic right shift
4691 instruct vsra2I(vecD dst, vecS shift) %{
4692   predicate(n->as_Vector()->length() == 2);
4693   match(Set dst (RShiftVI dst shift));
4694   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
4695   ins_encode %{
4696     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
4697   %}
4698   ins_pipe( pipe_slow );
4699 %}
4700 
4701 instruct vsra2I_imm(vecD dst, immI8 shift) %{
4702   predicate(n->as_Vector()->length() == 2);
4703   match(Set dst (RShiftVI dst shift));
4704   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed2I" %}
4705   ins_encode %{
4706     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
4707   %}
4708   ins_pipe( pipe_slow );
4709 %}
4710 
4711 instruct vsra2I_reg(vecD dst, vecD src, vecS shift) %{
4712   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4713   match(Set dst (RShiftVI src shift));
4714   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
4715   ins_encode %{
4716     bool vector256 = false;
4717     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4718   %}
4719   ins_pipe( pipe_slow );
4720 %}
4721 
4722 instruct vsra2I_reg_imm(vecD dst, vecD src, immI8 shift) %{
4723   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
4724   match(Set dst (RShiftVI src shift));
4725   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed2I" %}
4726   ins_encode %{
4727     bool vector256 = false;
4728     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4729   %}
4730   ins_pipe( pipe_slow );
4731 %}
4732 
4733 instruct vsra4I(vecX dst, vecS shift) %{
4734   predicate(n->as_Vector()->length() == 4);
4735   match(Set dst (RShiftVI dst shift));
4736   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
4737   ins_encode %{
4738     __ psrad($dst$$XMMRegister, $shift$$XMMRegister);
4739   %}
4740   ins_pipe( pipe_slow );
4741 %}
4742 
4743 instruct vsra4I_imm(vecX dst, immI8 shift) %{
4744   predicate(n->as_Vector()->length() == 4);
4745   match(Set dst (RShiftVI dst shift));
4746   format %{ "psrad   $dst,$shift\t! arithmetic right shift packed4I" %}
4747   ins_encode %{
4748     __ psrad($dst$$XMMRegister, (int)$shift$$constant);
4749   %}
4750   ins_pipe( pipe_slow );
4751 %}
4752 
4753 instruct vsra4I_reg(vecX dst, vecX src, vecS shift) %{
4754   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4755   match(Set dst (RShiftVI src shift));
4756   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
4757   ins_encode %{
4758     bool vector256 = false;
4759     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4760   %}
4761   ins_pipe( pipe_slow );
4762 %}
4763 
4764 instruct vsra4I_reg_imm(vecX dst, vecX src, immI8 shift) %{
4765   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
4766   match(Set dst (RShiftVI src shift));
4767   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed4I" %}
4768   ins_encode %{
4769     bool vector256 = false;
4770     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4771   %}
4772   ins_pipe( pipe_slow );
4773 %}
4774 
4775 instruct vsra8I_reg(vecY dst, vecY src, vecS shift) %{
4776   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
4777   match(Set dst (RShiftVI src shift));
4778   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
4779   ins_encode %{
4780     bool vector256 = true;
4781     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector256);
4782   %}
4783   ins_pipe( pipe_slow );
4784 %}
4785 
4786 instruct vsra8I_reg_imm(vecY dst, vecY src, immI8 shift) %{
4787   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
4788   match(Set dst (RShiftVI src shift));
4789   format %{ "vpsrad  $dst,$src,$shift\t! arithmetic right shift packed8I" %}
4790   ins_encode %{
4791     bool vector256 = true;
4792     __ vpsrad($dst$$XMMRegister, $src$$XMMRegister, (int)$shift$$constant, vector256);
4793   %}
4794   ins_pipe( pipe_slow );
4795 %}
4796 
4797 // There are no vector arithmetic right shift instructions for longs.
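// (SSE2 and AVX2 have packed logical quadword shifts, psllq/psrlq, but no
//  psraq; a packed 64-bit arithmetic right shift only appears with AVX-512
//  (vpsraq).  Purely as an illustration of what such a pattern would have to
//  synthesize, the scalar equivalent for a count 0 < n < 64 is
//    sra(x, n) == (int64_t)(((uint64_t)x >> n) | (-((uint64_t)x >> 63) << (64 - n)))
//  i.e. the logical shift OR-ed with a replicated-sign mask.)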
4798 
4799 
4800 // --------------------------------- AND --------------------------------------
4801 
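// The bitwise operations below are element-type agnostic, so one pattern per
// vector width covers every element type; the predicates therefore key on
// length_in_bytes() rather than on the element count used by the shifts.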
4802 instruct vand4B(vecS dst, vecS src) %{
4803   predicate(n->as_Vector()->length_in_bytes() == 4);
4804   match(Set dst (AndV dst src));
4805   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
4806   ins_encode %{
4807     __ pand($dst$$XMMRegister, $src$$XMMRegister);
4808   %}
4809   ins_pipe( pipe_slow );
4810 %}
4811 
4812 instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
4813   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
4814   match(Set dst (AndV src1 src2));
4815   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
4816   ins_encode %{
4817     bool vector256 = false;
4818     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4819   %}
4820   ins_pipe( pipe_slow );
4821 %}
4822 
4823 instruct vand8B(vecD dst, vecD src) %{
4824   predicate(n->as_Vector()->length_in_bytes() == 8);
4825   match(Set dst (AndV dst src));
4826   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
4827   ins_encode %{
4828     __ pand($dst$$XMMRegister, $src$$XMMRegister);
4829   %}
4830   ins_pipe( pipe_slow );
4831 %}
4832 
4833 instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
4834   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
4835   match(Set dst (AndV src1 src2));
4836   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
4837   ins_encode %{
4838     bool vector256 = false;
4839     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4840   %}
4841   ins_pipe( pipe_slow );
4842 %}
4843 
4844 instruct vand16B(vecX dst, vecX src) %{
4845   predicate(n->as_Vector()->length_in_bytes() == 16);
4846   match(Set dst (AndV dst src));
4847   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
4848   ins_encode %{
4849     __ pand($dst$$XMMRegister, $src$$XMMRegister);
4850   %}
4851   ins_pipe( pipe_slow );
4852 %}
4853 
4854 instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
4855   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
4856   match(Set dst (AndV src1 src2));
4857   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
4858   ins_encode %{
4859     bool vector256 = false;
4860     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4861   %}
4862   ins_pipe( pipe_slow );
4863 %}
4864 
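// Memory variants are provided only for the AVX forms: the non-destructive
// three-operand VEX encoding can fold the LoadVector into the operation while
// leaving both register sources intact, which the two-operand SSE encoding
// cannot.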
4865 instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
4866   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
4867   match(Set dst (AndV src (LoadVector mem)));
4868   format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
4869   ins_encode %{
4870     bool vector256 = false;
4871     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4872   %}
4873   ins_pipe( pipe_slow );
4874 %}
4875 
4876 instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
4877   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
4878   match(Set dst (AndV src1 src2));
4879   format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
4880   ins_encode %{
4881     bool vector256 = true;
4882     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4883   %}
4884   ins_pipe( pipe_slow );
4885 %}
4886 
4887 instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
4888   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
4889   match(Set dst (AndV src (LoadVector mem)));
4890   format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
4891   ins_encode %{
4892     bool vector256 = true;
4893     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4894   %}
4895   ins_pipe( pipe_slow );
4896 %}
4897 
4898 // --------------------------------- OR ---------------------------------------
4899 
4900 instruct vor4B(vecS dst, vecS src) %{
4901   predicate(n->as_Vector()->length_in_bytes() == 4);
4902   match(Set dst (OrV dst src));
4903   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
4904   ins_encode %{
4905     __ por($dst$$XMMRegister, $src$$XMMRegister);
4906   %}
4907   ins_pipe( pipe_slow );
4908 %}
4909 
4910 instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
4911   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
4912   match(Set dst (OrV src1 src2));
4913   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
4914   ins_encode %{
4915     bool vector256 = false;
4916     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4917   %}
4918   ins_pipe( pipe_slow );
4919 %}
4920 
4921 instruct vor8B(vecD dst, vecD src) %{
4922   predicate(n->as_Vector()->length_in_bytes() == 8);
4923   match(Set dst (OrV dst src));
4924   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
4925   ins_encode %{
4926     __ por($dst$$XMMRegister, $src$$XMMRegister);
4927   %}
4928   ins_pipe( pipe_slow );
4929 %}
4930 
4931 instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
4932   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
4933   match(Set dst (OrV src1 src2));
4934   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
4935   ins_encode %{
4936     bool vector256 = false;
4937     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4938   %}
4939   ins_pipe( pipe_slow );
4940 %}
4941 
4942 instruct vor16B(vecX dst, vecX src) %{
4943   predicate(n->as_Vector()->length_in_bytes() == 16);
4944   match(Set dst (OrV dst src));
4945   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
4946   ins_encode %{
4947     __ por($dst$$XMMRegister, $src$$XMMRegister);
4948   %}
4949   ins_pipe( pipe_slow );
4950 %}
4951 
4952 instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
4953   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
4954   match(Set dst (OrV src1 src2));
4955   format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
4956   ins_encode %{
4957     bool vector256 = false;
4958     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4959   %}
4960   ins_pipe( pipe_slow );
4961 %}
4962 
4963 instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
4964   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
4965   match(Set dst (OrV src (LoadVector mem)));
4966   format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
4967   ins_encode %{
4968     bool vector256 = false;
4969     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4970   %}
4971   ins_pipe( pipe_slow );
4972 %}
4973 
4974 instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
4975   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
4976   match(Set dst (OrV src1 src2));
4977   format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
4978   ins_encode %{
4979     bool vector256 = true;
4980     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
4981   %}
4982   ins_pipe( pipe_slow );
4983 %}
4984 
4985 instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
4986   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
4987   match(Set dst (OrV src (LoadVector mem)));
4988   format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
4989   ins_encode %{
4990     bool vector256 = true;
4991     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
4992   %}
4993   ins_pipe( pipe_slow );
4994 %}
4995 
4996 // --------------------------------- XOR --------------------------------------
4997 
4998 instruct vxor4B(vecS dst, vecS src) %{
4999   predicate(n->as_Vector()->length_in_bytes() == 4);
5000   match(Set dst (XorV dst src));
5001   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
5002   ins_encode %{
5003     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
5004   %}
5005   ins_pipe( pipe_slow );
5006 %}
5007 
5008 instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
5009   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
5010   match(Set dst (XorV src1 src2));
5011   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
5012   ins_encode %{
5013     bool vector256 = false;
5014     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5015   %}
5016   ins_pipe( pipe_slow );
5017 %}
5018 
5019 instruct vxor8B(vecD dst, vecD src) %{
5020   predicate(n->as_Vector()->length_in_bytes() == 8);
5021   match(Set dst (XorV dst src));
5022   format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
5023   ins_encode %{
5024     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
5025   %}
5026   ins_pipe( pipe_slow );
5027 %}
5028 
5029 instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
5030   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
5031   match(Set dst (XorV src1 src2));
5032   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
5033   ins_encode %{
5034     bool vector256 = false;
5035     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5036   %}
5037   ins_pipe( pipe_slow );
5038 %}
5039 
5040 instruct vxor16B(vecX dst, vecX src) %{
5041   predicate(n->as_Vector()->length_in_bytes() == 16);
5042   match(Set dst (XorV dst src));
5043   format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
5044   ins_encode %{
5045     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
5046   %}
5047   ins_pipe( pipe_slow );
5048 %}
5049 
5050 instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
5051   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
5052   match(Set dst (XorV src1 src2));
5053   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
5054   ins_encode %{
5055     bool vector256 = false;
5056     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5057   %}
5058   ins_pipe( pipe_slow );
5059 %}
5060 
5061 instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
5062   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
5063   match(Set dst (XorV src (LoadVector mem)));
5064   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
5065   ins_encode %{
5066     bool vector256 = false;
5067     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
5068   %}
5069   ins_pipe( pipe_slow );
5070 %}
5071 
5072 instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
5073   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
5074   match(Set dst (XorV src1 src2));
5075   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
5076   ins_encode %{
5077     bool vector256 = true;
5078     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector256);
5079   %}
5080   ins_pipe( pipe_slow );
5081 %}
5082 
5083 instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
5084   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
5085   match(Set dst (XorV src (LoadVector mem)));
5086   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
5087   ins_encode %{
5088     bool vector256 = true;
5089     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector256);
5090   %}
5091   ins_pipe( pipe_slow );
5092 %}
5093