src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

rev 60737 : 8252204: AArch64: Implement SHA3 accelerator/intrinsic
Reviewed-by: duke
Contributed-by: dongbo4@huawei.com


3274 
3275     if (multi_block) {
3276       __ add(ofs, ofs, 128);
3277       __ cmp(ofs, limit);
3278       __ br(Assembler::LE, sha512_loop);
3279       __ mov(c_rarg0, ofs); // return ofs
3280     }
3281 
3282     __ st1(v8, v9, v10, v11, __ T2D, state);
3283 
3284     __ ldpd(v14, v15, Address(sp, 48));
3285     __ ldpd(v12, v13, Address(sp, 32));
3286     __ ldpd(v10, v11, Address(sp, 16));
3287     __ ldpd(v8, v9, __ post(sp, 64));
3288 
3289     __ ret(lr);
3290 
3291     return start;
3292   }
3293 
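Both the SHA-512 epilogue above and the SHA3 stub that follows use the same multi-block convention: after each compression the offset is advanced by one block, the loop repeats while ofs <= limit, and the final ofs is returned in c_rarg0 so the Java caller knows how far the input was consumed. A hedged C-level model of that contract (the names here are illustrative, not HotSpot's):

// Illustrative model of the "...MB" (multi-block) entries: absorb whole blocks
// while ofs <= limit, then report the new offset back to the caller.
int compress_multi_block_model(const unsigned char* buf, void* state,
                               int block_size, int ofs, int limit,
                               void (*compress_one)(const unsigned char*, void*)) {
  do {
    compress_one(buf, state);   // one compression / permutation over one block
    buf += block_size;
    ofs += block_size;          // "__ add(ofs, ofs, 128)" in the SHA-512 case
  } while (ofs <= limit);       // "__ cmp(ofs, limit); __ br(LE, loop)"
  return ofs;                   // "__ mov(c_rarg0, ofs)  // return ofs"
}

For SHA-512 the block size is the fixed 128 bytes added above; the SHA3 stub below derives it from digest_length instead.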
3294   // Arguments:
3295   //
3296   // Inputs:
3297   //   c_rarg0   - byte[]  source+offset
3298   //   c_rarg1   - byte[]   SHA.state
3299   //   c_rarg2   - int     digest_length
3300   //   c_rarg3   - int     offset
3301   //   c_rarg4   - int     limit
3302   //
3303   address generate_sha3_implCompress(bool multi_block, const char *name) {
3304     static const uint64_t round_consts[24] = {
3305       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
3306       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
3307       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
3308       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
3309       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
3310       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
3311       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
3312       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
3313     };
3314 
3315     __ align(CodeEntryAlignment);
3316     StubCodeMark mark(this, "StubRoutines", name);
3317     address start = __ pc();
3318 
3319     Register buf           = c_rarg0;
3320     Register state         = c_rarg1;
3321     Register digest_length = c_rarg2;
3322     Register ofs           = c_rarg3;
3323     Register limit         = c_rarg4;
3324 
3325     Label sha3_loop, rounds24_loop;
3326     Label sha3_512, sha3_384_or_224, sha3_256;
3327 
3328     __ stpd(v8, v9, __ pre(sp, -64));
3329     __ stpd(v10, v11, Address(sp, 16));
3330     __ stpd(v12, v13, Address(sp, 32));
3331     __ stpd(v14, v15, Address(sp, 48));
3332 
3333     // load state
3334     __ add(rscratch1, state, 32);
3335     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
3336     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
3337     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
3338     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
3339     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
3340     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
3341     __ ld1(v24, __ T1D, rscratch1);
3342 
3343     __ BIND(sha3_loop);
3344 
3345     // 24 keccak rounds
3346     __ movw(rscratch2, 24);
3347 
3348     // load round_constants base
3349     __ lea(rscratch1, ExternalAddress((address) round_consts));
3350 
3351     // load input
3352     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3353     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
3354     __ eor(v0, __ T8B, v0, v25);
3355     __ eor(v1, __ T8B, v1, v26);
3356     __ eor(v2, __ T8B, v2, v27);
3357     __ eor(v3, __ T8B, v3, v28);
3358     __ eor(v4, __ T8B, v4, v29);
3359     __ eor(v5, __ T8B, v5, v30);
3360     __ eor(v6, __ T8B, v6, v31);
3361 
3362     // digest_length == 64, SHA3-512
3363     __ tbnz(digest_length, 6, sha3_512);
3364 
3365     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3366     __ ld1(v29, v30, __ T8B, __ post(buf, 16));
3367     __ eor(v7, __ T8B, v7, v25);
3368     __ eor(v8, __ T8B, v8, v26);
3369     __ eor(v9, __ T8B, v9, v27);
3370     __ eor(v10, __ T8B, v10, v28);
3371     __ eor(v11, __ T8B, v11, v29);
3372     __ eor(v12, __ T8B, v12, v30);
3373 
3374     // digest_length == 28, SHA3-224;  digest_length == 48, SHA3-384
3375     __ tbnz(digest_length, 4, sha3_384_or_224);
3376 
3377     // SHA3-256
3378     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3379     __ eor(v13, __ T8B, v13, v25);
3380     __ eor(v14, __ T8B, v14, v26);
3381     __ eor(v15, __ T8B, v15, v27);
3382     __ eor(v16, __ T8B, v16, v28);
3383     __ b(rounds24_loop);
3384 
3385     __ BIND(sha3_384_or_224);
3386     __ tbz(digest_length, 2, rounds24_loop); // bit 2 cleared? SHA3-384
3387 
3388     // SHA3-224
3389     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
3390     __ ld1(v29, __ T8B, __ post(buf, 8));
3391     __ eor(v13, __ T8B, v13, v25);
3392     __ eor(v14, __ T8B, v14, v26);
3393     __ eor(v15, __ T8B, v15, v27);
3394     __ eor(v16, __ T8B, v16, v28);
3395     __ eor(v17, __ T8B, v17, v29);
3396     __ b(rounds24_loop);
3397 
3398     __ BIND(sha3_512);
3399     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
3400     __ eor(v7, __ T8B, v7, v25);
3401     __ eor(v8, __ T8B, v8, v26);
3402 
3403     __ BIND(rounds24_loop);
3404     __ subw(rscratch2, rscratch2, 1);
3405 
3406     __ eor3(v29, __ T16B, v4, v9, v14);
3407     __ eor3(v26, __ T16B, v1, v6, v11);
3408     __ eor3(v28, __ T16B, v3, v8, v13);
3409     __ eor3(v25, __ T16B, v0, v5, v10);
3410     __ eor3(v27, __ T16B, v2, v7, v12);
3411     __ eor3(v29, __ T16B, v29, v19, v24);
3412     __ eor3(v26, __ T16B, v26, v16, v21);
3413     __ eor3(v28, __ T16B, v28, v18, v23);
3414     __ eor3(v25, __ T16B, v25, v15, v20);
3415     __ eor3(v27, __ T16B, v27, v17, v22);
3416 
3417     __ rax1(v30, __ T2D, v29, v26);
3418     __ rax1(v26, __ T2D, v26, v28);
3419     __ rax1(v28, __ T2D, v28, v25);
3420     __ rax1(v25, __ T2D, v25, v27);
3421     __ rax1(v27, __ T2D, v27, v29);
3422 
3423     __ eor(v0, __ T16B, v0, v30);
3424     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
3425     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
3426     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
3427     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
3428     __ xar(v22, __ T2D, v14, v28, (64 - 39));
3429     __ xar(v14, __ T2D, v20, v30, (64 - 18));
3430     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
3431     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
3432     __ xar(v12, __ T2D, v13, v27, (64 - 25));
3433     __ xar(v13, __ T2D, v19, v28, (64 - 8));
3434     __ xar(v19, __ T2D, v23, v27, (64 - 56));
3435     __ xar(v23, __ T2D, v15, v30, (64 - 41));
3436     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
3437     __ xar(v28, __ T2D, v24, v28, (64 - 14));
3438     __ xar(v24, __ T2D, v21, v25, (64 - 2));
3439     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
3440     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
3441     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
3442     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
3443     __ xar(v27, __ T2D, v18, v27, (64 - 21));
3444     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
3445     __ xar(v25, __ T2D, v11, v25, (64 - 10));
3446     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
3447     __ xar(v30, __ T2D, v10, v30, (64 - 3));
3448 
3449     __ bcax(v20, __ T16B, v31, v22, v8);
3450     __ bcax(v21, __ T16B, v8,  v23, v22);
3451     __ bcax(v22, __ T16B, v22, v24, v23);
3452     __ bcax(v23, __ T16B, v23, v31, v24);
3453     __ bcax(v24, __ T16B, v24, v8,  v31);
3454 
3455     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
3456 
3457     __ bcax(v17, __ T16B, v25, v19, v3);
3458     __ bcax(v18, __ T16B, v3,  v15, v19);
3459     __ bcax(v19, __ T16B, v19, v16, v15);
3460     __ bcax(v15, __ T16B, v15, v25, v16);
3461     __ bcax(v16, __ T16B, v16, v3,  v25);
3462 
3463     __ bcax(v10, __ T16B, v29, v12, v26);
3464     __ bcax(v11, __ T16B, v26, v13, v12);
3465     __ bcax(v12, __ T16B, v12, v14, v13);
3466     __ bcax(v13, __ T16B, v13, v29, v14);
3467     __ bcax(v14, __ T16B, v14, v26, v29);
3468 
3469     __ bcax(v7, __ T16B, v30, v9,  v4);
3470     __ bcax(v8, __ T16B, v4,  v5,  v9);
3471     __ bcax(v9, __ T16B, v9,  v6,  v5);
3472     __ bcax(v5, __ T16B, v5,  v30, v6);
3473     __ bcax(v6, __ T16B, v6,  v4,  v30);
3474 
3475     __ bcax(v3, __ T16B, v27, v0,  v28);
3476     __ bcax(v4, __ T16B, v28, v1,  v0);
3477     __ bcax(v0, __ T16B, v0,  v2,  v1);
3478     __ bcax(v1, __ T16B, v1,  v27, v2);
3479     __ bcax(v2, __ T16B, v2,  v28, v27);
3480 
3481     __ eor(v0, __ T16B, v0, v31);
3482 
3483     __ cbnzw(rscratch2, rounds24_loop);
3484 
3485     if (multi_block) {
3486       // block_size =  200 - 2 * digest_length, ofs += block_size
3487       __ add(ofs, ofs, 200);
3488       __ sub(ofs, ofs, digest_length, Assembler::LSL, 1);
3489 
3490       __ cmp(ofs, limit);
3491       __ br(Assembler::LE, sha3_loop);
3492       __ mov(c_rarg0, ofs); // return ofs
3493     }
3494 
3495     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
3496     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
3497     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
3498     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
3499     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
3500     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
3501     __ st1(v24, __ T1D, state);
3502 
3503     __ ldpd(v14, v15, Address(sp, 48));
3504     __ ldpd(v12, v13, Address(sp, 32));
3505     __ ldpd(v10, v11, Address(sp, 16));
3506     __ ldpd(v8, v9, __ post(sp, 64));
3507 
3508     __ ret(lr);
3509 
3510     return start;
3511   }
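The stub tells the SHA3 variants apart purely from digest_length (64, 48, 32 or 28): the rate, that is the number of bytes absorbed per Keccak-f[1600] permutation, is 200 - 2 * digest_length, and the tbnz/tbz tests above read single bits of that value. A small sketch of the same selection, for illustration only (not code from this patch):

// Bytes absorbed per permutation for each supported digest length.
static int sha3_rate_bytes(int digest_length) {
  return 200 - 2 * digest_length;   // 72, 104, 136 or 144 bytes
}

// The bit tests used by the stub:
//   bit 6 set             -> 64  (SHA3-512, 9 lanes absorbed)
//   bit 4 set             -> 48 or 28 (SHA3-384 or SHA3-224)
//     ... and bit 2 set   -> 28  (SHA3-224, 18 lanes)
//     ... and bit 2 clear -> 48  (SHA3-384, 13 lanes)
//   otherwise             -> 32  (SHA3-256, 17 lanes)
static const char* sha3_variant(int digest_length) {
  if (digest_length & (1 << 6)) return "SHA3-512";
  if (digest_length & (1 << 4)) return (digest_length & (1 << 2)) ? "SHA3-224" : "SHA3-384";
  return "SHA3-256";
}

The multi_block exit uses the same formula: ofs is advanced by 200 - 2 * digest_length before being compared against limit.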
3512 
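For reference, the eor3/rax1/xar/bcax sequence inside rounds24_loop computes one round of Keccak-f[1600]: eor3 and rax1 build the theta parities and D values, xar performs the theta xor combined with the rho/pi rotation, bcax is the chi step, and the final eor with the loaded round constant is iota. The plain scalar sketch below shows the same round; it is my own illustration using the conventional rotation and pi tables, not code from this change:

#include <stdint.h>

static inline uint64_t rotl64(uint64_t x, int n) { return (x << n) | (x >> (64 - n)); }

// One Keccak-f[1600] round over the 25-lane state (lane (x, y) lives at A[x + 5*y]).
// rc is the current entry of the round_consts table above.
static void keccak_round(uint64_t A[25], uint64_t rc) {
  // theta: column parities, then every lane is xor-ed with its column's D value.
  uint64_t C[5], D[5];
  for (int x = 0; x < 5; x++)
    C[x] = A[x] ^ A[x + 5] ^ A[x + 10] ^ A[x + 15] ^ A[x + 20];
  for (int x = 0; x < 5; x++)
    D[x] = C[(x + 4) % 5] ^ rotl64(C[(x + 1) % 5], 1);
  for (int i = 0; i < 25; i++)
    A[i] ^= D[i % 5];

  // rho + pi: each lane is rotated by a fixed offset and moved to a new slot
  // (folded into the xar instructions in the stub).
  static const int rotc[24] = { 1,  3,  6, 10, 15, 21, 28, 36, 45, 55,  2, 14,
                               27, 41, 56,  8, 25, 43, 62, 18, 39, 61, 20, 44};
  static const int piln[24] = {10,  7, 11, 17, 18,  3,  5, 16,  8, 21, 24,  4,
                               15, 23, 19, 13, 12,  2, 20, 14, 22,  9,  6,  1};
  uint64_t t = A[1];
  for (int i = 0; i < 24; i++) {
    uint64_t tmp = A[piln[i]];
    A[piln[i]] = rotl64(t, rotc[i]);
    t = tmp;
  }

  // chi: A[x] ^= ~A[x+1] & A[x+2], row by row (the bcax instruction).
  for (int y = 0; y < 25; y += 5) {
    uint64_t row[5];
    for (int x = 0; x < 5; x++) row[x] = A[y + x];
    for (int x = 0; x < 5; x++)
      A[y + x] = row[x] ^ (~row[(x + 1) % 5] & row[(x + 2) % 5]);
  }

  // iota: fold in the round constant.
  A[0] ^= rc;
}

The stub keeps the whole 25-lane state in v0..v24 and runs 24 such rounds per absorbed block, which is what the rscratch2 counter and the round_consts table above correspond to.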
3513   // Safefetch stubs.
3514   void generate_safefetch(const char* name, int size, address* entry,
3515                           address* fault_pc, address* continuation_pc) {
3516     // safefetch signatures:
3517     //   int      SafeFetch32(int*      adr, int      errValue);
3518     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3519     //
3520     // arguments:
3521     //   c_rarg0 = adr
3522     //   c_rarg1 = errValue
3523     //
3524     // result:
3525     //   c_rarg0  = *adr or errValue
3526 
3527     StubCodeMark mark(this, "StubRoutines", name);
3528 
3529     // Entry point, pc or function descriptor.
3530     *entry = __ pc();
3531 
3532     // Load *adr into c_rarg1, may fault.
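The safefetch listing is cut off here, but the header comment above already states its whole contract: read *adr and return it, or return errValue if the read faults (the faulting pc is redirected to the continuation pc by the signal handler). A semantic model in plain C++, purely illustrative; is_readable() is a made-up stand-in for that fault/continuation machinery, not a real HotSpot function:

// Hypothetical predicate standing in for the fault-handler mechanism.
bool is_readable(const void* p);

// What SafeFetch32 computes, ignoring how the faulting load is intercepted.
int SafeFetch32_model(int* adr, int errValue) {
  return is_readable(adr) ? *adr : errValue;
}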


6223     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
6224 
6225     if (UseAESIntrinsics) {
6226       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
6227       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
6228       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
6229       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
6230     }
6231 
6232     if (UseSHA1Intrinsics) {
6233       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
6234       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
6235     }
6236     if (UseSHA256Intrinsics) {
6237       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
6238       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
6239     }
6240     if (UseSHA512Intrinsics) {
6241       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
6242       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
6243     }
6244     if (UseSHA3Intrinsics) {
6245       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
6246       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
6247     }
6248 
6249     // generate Adler32 intrinsics code
6250     if (UseAdler32Intrinsics) {
6251       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
6252     }
6253 
6254     StubRoutines::aarch64::set_completed();
6255   }
6256 
6257  public:
6258   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
6259     if (all) {
6260       generate_all();
6261     } else {
6262       generate_initial();
6263     }
6264   }
6265 }; // end class declaration
6266 
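The registration block above only runs when UseSHA3Intrinsics is true; the flag itself is validated against the CPU's SHA3 feature elsewhere (vm_version_aarch64.cpp, which is not part of this page). For orientation, this is the usual shape of that guard; cpu_supports_sha3() is a made-up placeholder for the real feature test, so treat this as a sketch rather than the patch's code:

// Sketch of the customary flag guard (illustrative; the actual check may differ).
if (UseSHA3Intrinsics && !cpu_supports_sha3()) {
  warning("SHA3 intrinsics are not available on this CPU");
  FLAG_SET_DEFAULT(UseSHA3Intrinsics, false);
}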