diff --git a/gcc/config/gcn/gcn-opts.h b/gcc/config/gcn/gcn-opts.h index 79fbda3ab25..6be2c9204fa 100644 --- a/gcc/config/gcn/gcn-opts.h +++ b/gcc/config/gcn/gcn-opts.h @@ -62,7 +62,7 @@ extern enum gcn_isa { #define TARGET_M0_LDS_LIMIT (TARGET_GCN3) -#define TARGET_PACKED_WORK_ITEMS (TARGET_CDNA2_PLUS) +#define TARGET_PACKED_WORK_ITEMS (TARGET_CDNA2_PLUS || TARGET_RDNA3) #define TARGET_XNACK (flag_xnack != HSACO_ATTR_OFF) diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md index 3d5b6271ee6..cd027f8b369 100644 --- a/gcc/config/gcn/gcn-valu.md +++ b/gcc/config/gcn/gcn-valu.md @@ -3555,30 +3555,63 @@ ;; }}} ;; {{{ Int/int conversions +(define_code_iterator all_convert [truncate zero_extend sign_extend]) (define_code_iterator zero_convert [truncate zero_extend]) (define_code_attr convop [ (sign_extend "extend") (zero_extend "zero_extend") (truncate "trunc")]) -(define_insn "2" +(define_expand "2" + [(set (match_operand:V_INT_1REG 0 "register_operand" "=v") + (all_convert:V_INT_1REG + (match_operand:V_INT_1REG_ALT 1 "gcn_alu_operand" " v")))] + "") + +(define_insn "*_sdwa" [(set (match_operand:V_INT_1REG 0 "register_operand" "=v") (zero_convert:V_INT_1REG (match_operand:V_INT_1REG_ALT 1 "gcn_alu_operand" " v")))] - "" + "!TARGET_RDNA3" "v_mov_b32_sdwa\t%0, %1 dst_sel: dst_unused:UNUSED_PAD src0_sel:" [(set_attr "type" "vop_sdwa") (set_attr "length" "8")]) -(define_insn "extend2" +(define_insn "extend_sdwa" [(set (match_operand:V_INT_1REG 0 "register_operand" "=v") (sign_extend:V_INT_1REG (match_operand:V_INT_1REG_ALT 1 "gcn_alu_operand" " v")))] - "" + "!TARGET_RDNA3" "v_mov_b32_sdwa\t%0, sext(%1) src0_sel:" [(set_attr "type" "vop_sdwa") (set_attr "length" "8")]) +(define_insn "*_shift" + [(set (match_operand:V_INT_1REG 0 "register_operand" "=v") + (all_convert:V_INT_1REG + (match_operand:V_INT_1REG_ALT 1 "gcn_alu_operand" " v")))] + "TARGET_RDNA3" + { + enum {extend, zero_extend, trunc}; + rtx shiftwidth = (mode == QImode + || mode == QImode + ? GEN_INT (24) + : mode == HImode + || mode == HImode + ? GEN_INT (16) + : NULL); + operands[2] = shiftwidth; + + if (!shiftwidth) + return "v_mov_b32 %0, %1"; + else if ( == extend || == trunc) + return "v_lshlrev_b32\t%0, %2, %1\;v_ashrrev_i32\t%0, %2, %0"; + else + return "v_lshlrev_b32\t%0, %2, %1\;v_lshrrev_b32\t%0, %2, %0"; + } + [(set_attr "type" "mult") + (set_attr "length" "8")]) + ;; GCC can already do these for scalar types, but not for vector types. ;; Unfortunately you can't just do SUBREG on a vector to select the low part, ;; so there must be a few tricks here. diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc index e668ce7c69e..e80de2ce056 100644 --- a/gcc/config/gcn/gcn.cc +++ b/gcc/config/gcn/gcn.cc @@ -1597,8 +1597,8 @@ gcn_global_address_p (rtx addr) rtx offset = XEXP (addr, 1); int offsetbits = (TARGET_RDNA2_PLUS ? 11 : 12); bool immediate_p = (CONST_INT_P (offset) - && INTVAL (offset) >= -(1 << 12) - && INTVAL (offset) < (1 << 12)); + && INTVAL (offset) >= -(1 << offsetbits) + && INTVAL (offset) < (1 << offsetbits)); if ((gcn_address_register_p (base, DImode, false) || gcn_vec_address_register_p (base, DImode, false)) @@ -6597,8 +6597,10 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, if (df_regs_ever_live_p (FIRST_AVGPR_REG + avgpr)) break; avgpr++; - vgpr = (vgpr + 3) & ~3; - avgpr = (avgpr + 3) & ~3; + + /* The main function epilogue uses v8, but df doesn't see that. */ + if (vgpr < 9) + vgpr = 9; if (!leaf_function_p ()) { @@ -6611,9 +6613,18 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, avgpr = MAX_NORMAL_AVGPR_COUNT; } - /* The gfx90a accum_offset field can't represent 0 registers. */ - if (gcn_arch == PROCESSOR_GFX90a && vgpr < 4) - vgpr = 4; + /* SIMD32 devices count double in wavefront64 mode. */ + if (TARGET_RDNA2_PLUS) + vgpr *= 2; + + /* Round up to the allocation block size. */ + int vgpr_block_size = (TARGET_RDNA3 ? 12 + : TARGET_RDNA2_PLUS || TARGET_CDNA2_PLUS ? 8 + : 4); + if (vgpr % vgpr_block_size) + vgpr += vgpr_block_size - (vgpr % vgpr_block_size); + if (avgpr % vgpr_block_size) + avgpr += vgpr_block_size - (avgpr % vgpr_block_size); fputs ("\t.rodata\n" "\t.p2align\t6\n" @@ -6714,12 +6725,14 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, " .private_segment_fixed_size: 0\n" " .wavefront_size: 64\n" " .sgpr_count: %i\n" - " .vgpr_count: %i\n" + " .vgpr_count: %i%s\n" " .max_flat_workgroup_size: 1024\n", cfun->machine->kernarg_segment_byte_size, cfun->machine->kernarg_segment_alignment, LDS_SIZE, - sgpr, next_free_vgpr); + sgpr, next_free_vgpr, + (TARGET_RDNA2_PLUS ? " ; wavefrontsize64 counts double on SIMD32" + : "")); if (gcn_arch == PROCESSOR_GFX90a || gcn_arch == PROCESSOR_GFX908) fprintf (file, " .agpr_count: %i\n", avgpr); fputs (" .end_amdgpu_metadata\n", file); diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md index 492b833e255..1f3c692b7a6 100644 --- a/gcc/config/gcn/gcn.md +++ b/gcc/config/gcn/gcn.md @@ -1618,7 +1618,7 @@ (mult:SI (any_extend:SI (match_operand:HI 1 "register_operand" "%v")) (any_extend:SI (match_operand:HI 2 "register_operand" " v"))))] - "" + "!TARGET_RDNA3" "v_mul_32_24_sdwa\t%0, %1, %2 src0_sel:WORD_0 src1_sel:WORD_0" [(set_attr "type" "vop_sdwa") (set_attr "length" "8")]) @@ -1628,7 +1628,7 @@ (mult:HI (any_extend:HI (match_operand:QI 1 "register_operand" "%v")) (any_extend:HI (match_operand:QI 2 "register_operand" " v"))))] - "" + "!TARGET_RDNA3" "v_mul_32_24_sdwa\t%0, %1, %2 src0_sel:BYTE_0 src1_sel:BYTE_0" [(set_attr "type" "vop_sdwa") (set_attr "length" "8")]) diff --git a/libgcc/config/gcn/amdgcn_veclib.h b/libgcc/config/gcn/amdgcn_veclib.h index 821f6386dd6..d268c6cac16 100644 --- a/libgcc/config/gcn/amdgcn_veclib.h +++ b/libgcc/config/gcn/amdgcn_veclib.h @@ -230,7 +230,7 @@ do { \ #if defined (__GCN3__) || defined (__GCN5__) \ || defined (__CDNA1__) || defined (__CDNA2__) \ - || defined (__RDNA2__) + || defined (__RDNA2__) || defined (__RDNA3__) #define CDNA3_PLUS 0 #else #define CDNA3_PLUS 1 diff --git a/libgomp/config/gcn/time.c b/libgomp/config/gcn/time.c index 30a0d0188e4..efcd04f5f43 100644 --- a/libgomp/config/gcn/time.c +++ b/libgomp/config/gcn/time.c @@ -30,15 +30,25 @@ /* According to AMD: dGPU RTC is 27MHz AGPU RTC is 100MHz + RDNA3 ISA manual states "typically 100MHz" FIXME: DTRT on an APU. */ +#ifdef __RDNA3__ +#define RTC_TICKS (1.0 / 100000000.0) /* 100MHz */ +#else #define RTC_TICKS (1.0 / 27000000.0) /* 27MHz */ +#endif double omp_get_wtime (void) { uint64_t clock; +#ifdef __RDNA3__ + asm ("s_sendmsg_rtn_b64 %0 0x83 ;Get REALTIME\n\t" + "s_waitcnt 0" : "=r" (clock)); +#else asm ("s_memrealtime %0\n\t" "s_waitcnt 0" : "=r" (clock)); +#endif return clock * RTC_TICKS; } diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c index 0339848451e..db28781dedb 100644 --- a/libgomp/plugin/plugin-gcn.c +++ b/libgomp/plugin/plugin-gcn.c @@ -1741,11 +1741,13 @@ max_isa_vgprs (int isa) case EF_AMDGPU_MACH_AMDGCN_GFX900: case EF_AMDGPU_MACH_AMDGCN_GFX906: case EF_AMDGPU_MACH_AMDGCN_GFX908: - case EF_AMDGPU_MACH_AMDGCN_GFX1030: - case EF_AMDGPU_MACH_AMDGCN_GFX1100: return 256; case EF_AMDGPU_MACH_AMDGCN_GFX90a: return 512; + case EF_AMDGPU_MACH_AMDGCN_GFX1030: + return 512; /* 512 SIMD32 = 256 wavefrontsize64. */ + case EF_AMDGPU_MACH_AMDGCN_GFX1100: + return 1536; /* 1536 SIMD32 = 768 wavefrontsize64. */ } GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs"); }