amdgcn: Switch to HSACO v3 binary format

This upgrades the compiler to emit HSA Code Object v3 binaries.  This means
changing the assembler directives and the linker command-line options.

The gcn-run and libgomp loaders need corresponding alterations.  The
relocations no longer need to be fixed up manually, and the kernel symbol
names have changed slightly.

This move makes the binaries compatible with the new rocgdb from ROCm 3.5.

2020-06-17  Andrew Stubbs  <ams@codesourcery.com>

	gcc/
	* config/gcn/gcn-hsa.h (TEXT_SECTION_ASM_OP): Use ".text".
	(BSS_SECTION_ASM_OP): Use ".bss".
	(ASM_SPEC): Remove "-mattr=-code-object-v3".
	(LINK_SPEC): Add "--export-dynamic".
	* config/gcn/gcn-opts.h (processor_type): Replace PROCESSOR_VEGA with
	PROCESSOR_VEGA10 and PROCESSOR_VEGA20.
	* config/gcn/gcn-run.c (HSA_RUNTIME_LIB): Use ".so.1" variant.
	(load_image): Remove obsolete relocation handling.
	Add ".kd" suffix to the symbol names.
	* config/gcn/gcn.c (MAX_NORMAL_SGPR_COUNT): Set to 62.
	(gcn_option_override): Update gcn_isa test.
	(gcn_kernel_arg_types): Update all the assembler directives.
	Remove the obsolete options.
	(gcn_conditional_register_usage): Update MAX_NORMAL_SGPR_COUNT usage.
	(gcn_omp_device_kind_arch_isa): Handle PROCESSOR_VEGA10 and
	PROCESSOR_VEGA20.
	(output_file_start): Rework assembler file header.
	(gcn_hsa_declare_function_name): Rework kernel metadata.
	* config/gcn/gcn.h (GCN_KERNEL_ARG_TYPES): Set to 16.
	* config/gcn/gcn.opt (PROCESSOR_VEGA): Remove enum.
	(PROCESSOR_VEGA10): New enum value.
	(PROCESSOR_VEGA20): New enum value.

	libgomp/
	* plugin/plugin-gcn.c (init_environment_variables): Use ".so.1"
	variant for HSA_RUNTIME_LIB name.
	(find_executable_symbol_1): Delete.
	(find_executable_symbol): Delete.
	(init_kernel_properties): Add ".kd" suffix to symbol names.
	(find_load_offset): Delete.
	(create_and_finalize_hsa_program): Remove relocation handling.
This commit is contained in:
Andrew Stubbs 2020-06-02 21:00:40 +01:00
parent 8ad4fc26dc
commit f062c3f115
7 changed files with 121 additions and 518 deletions

View file

@ -18,8 +18,8 @@
#error elf.h included before elfos.h
#endif
#define TEXT_SECTION_ASM_OP "\t.section\t.text"
#define BSS_SECTION_ASM_OP "\t.section\t.bss"
#define TEXT_SECTION_ASM_OP "\t.text"
#define BSS_SECTION_ASM_OP "\t.bss"
#define GLOBAL_ASM_OP "\t.globl\t"
#define DATA_SECTION_ASM_OP "\t.data\t"
#define SET_ASM_OP "\t.set\t"
@ -76,10 +76,10 @@ extern unsigned int gcn_local_sym_hash (const char *name);
#define GOMP_SELF_SPECS ""
/* Use LLVM assembler and linker options. */
#define ASM_SPEC "-triple=amdgcn--amdhsa -mattr=-code-object-v3 " \
#define ASM_SPEC "-triple=amdgcn--amdhsa " \
"%:last_arg(%{march=*:-mcpu=%*}) " \
"-filetype=obj"
#define LINK_SPEC "--pie"
#define LINK_SPEC "--pie --export-dynamic"
#define LIB_SPEC "-lc"
/* Provides a _start symbol to keep the linker happy. */

View file

@ -20,8 +20,9 @@
/* Which processor to generate code or schedule for. */
enum processor_type
{
PROCESSOR_FIJI,
PROCESSOR_VEGA
PROCESSOR_FIJI, // gfx803
PROCESSOR_VEGA10, // gfx900
PROCESSOR_VEGA20 // gfx906
};
/* Set in gcn_option_override. */

View file

@ -55,7 +55,7 @@
#include "hsa.h"
#ifndef HSA_RUNTIME_LIB
#define HSA_RUNTIME_LIB "libhsa-runtime64.so"
#define HSA_RUNTIME_LIB "libhsa-runtime64.so.1"
#endif
#ifndef VERSION_STRING
@ -429,20 +429,6 @@ load_image (const char *filename)
&executable),
"Initialize GCN executable");
/* Hide relocations from the HSA runtime loader.
Keep a copy of the unmodified section headers to use later. */
Elf64_Shdr *image_sections =
(Elf64_Shdr *) ((char *) image + image->e_shoff);
Elf64_Shdr *sections = malloc (sizeof (Elf64_Shdr) * image->e_shnum);
memcpy (sections, image_sections, sizeof (Elf64_Shdr) * image->e_shnum);
for (int i = image->e_shnum - 1; i >= 0; i--)
{
if (image_sections[i].sh_type == SHT_RELA
|| image_sections[i].sh_type == SHT_REL)
/* Change section type to something harmless. */
image_sections[i].sh_type = SHT_NOTE;
}
/* Add the HSACO to the executable. */
hsa_code_object_t co = { 0 };
XHSA (hsa_fns.hsa_code_object_deserialize_fn (image, image_size, NULL, &co),
@ -457,23 +443,27 @@ load_image (const char *filename)
/* Locate the "_init_array" function, and read the kernel's properties. */
hsa_executable_symbol_t symbol;
XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_init_array",
device, 0, &symbol),
XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL,
"_init_array.kd", device, 0,
&symbol),
"Find '_init_array' function");
XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &init_array_kernel),
(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
&init_array_kernel),
"Extract '_init_array' kernel object kernel object");
/* Locate the "_fini_array" function, and read the kernel's properties. */
XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_fini_array",
device, 0, &symbol),
XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL,
"_fini_array.kd", device, 0,
&symbol),
"Find '_fini_array' function");
XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &fini_array_kernel),
(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
&fini_array_kernel),
"Extract '_fini_array' kernel object kernel object");
/* Locate the "main" function, and read the kernel's properties. */
XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main",
XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main.kd",
device, 0, &symbol),
"Find 'main' function");
XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
@ -491,126 +481,6 @@ load_image (const char *filename)
(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
&private_segment_size),
"Extract private segment size");
/* Find main function in ELF, and calculate actual load offset. */
Elf64_Addr load_offset;
XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
(symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
&load_offset),
"Extract 'main' symbol address");
for (int i = 0; i < image->e_shnum; i++)
if (sections[i].sh_type == SHT_SYMTAB)
{
Elf64_Shdr *strtab = &sections[sections[i].sh_link];
char *strings = (char *) image + strtab->sh_offset;
for (size_t offset = 0;
offset < sections[i].sh_size;
offset += sections[i].sh_entsize)
{
Elf64_Sym *sym = (Elf64_Sym *) ((char *) image
+ sections[i].sh_offset + offset);
if (strcmp ("main", strings + sym->st_name) == 0)
{
load_offset -= sym->st_value;
goto found_main;
}
}
}
/* We only get here when main was not found.
This should never happen. */
fprintf (stderr, "Error: main function not found.\n");
abort ();
found_main:;
/* Find dynamic symbol table. */
Elf64_Shdr *dynsym = NULL;
for (int i = 0; i < image->e_shnum; i++)
if (sections[i].sh_type == SHT_DYNSYM)
{
dynsym = &sections[i];
break;
}
/* Fix up relocations. */
for (int i = 0; i < image->e_shnum; i++)
{
if (sections[i].sh_type == SHT_RELA)
for (size_t offset = 0;
offset < sections[i].sh_size;
offset += sections[i].sh_entsize)
{
Elf64_Rela *reloc = (Elf64_Rela *) ((char *) image
+ sections[i].sh_offset
+ offset);
Elf64_Sym *sym =
(dynsym
? (Elf64_Sym *) ((char *) image
+ dynsym->sh_offset
+ (dynsym->sh_entsize
* ELF64_R_SYM (reloc->r_info))) : NULL);
int64_t S = (sym ? sym->st_value : 0);
int64_t P = reloc->r_offset + load_offset;
int64_t A = reloc->r_addend;
int64_t B = load_offset;
int64_t V, size;
switch (ELF64_R_TYPE (reloc->r_info))
{
case R_AMDGPU_ABS32_LO:
V = (S + A) & 0xFFFFFFFF;
size = 4;
break;
case R_AMDGPU_ABS32_HI:
V = (S + A) >> 32;
size = 4;
break;
case R_AMDGPU_ABS64:
V = S + A;
size = 8;
break;
case R_AMDGPU_REL32:
V = S + A - P;
size = 4;
break;
case R_AMDGPU_REL64:
/* FIXME
LLD seems to emit REL64 where the assembler has ABS64.
This is clearly wrong because it's not what the compiler
is expecting. Let's assume, for now, that it's a bug.
In any case, GCN kernels are always self contained and
therefore relative relocations will have been resolved
already, so this should be a safe workaround. */
V = S + A /* - P */ ;
size = 8;
break;
case R_AMDGPU_ABS32:
V = S + A;
size = 4;
break;
/* TODO R_AMDGPU_GOTPCREL */
/* TODO R_AMDGPU_GOTPCREL32_LO */
/* TODO R_AMDGPU_GOTPCREL32_HI */
case R_AMDGPU_REL32_LO:
V = (S + A - P) & 0xFFFFFFFF;
size = 4;
break;
case R_AMDGPU_REL32_HI:
V = (S + A - P) >> 32;
size = 4;
break;
case R_AMDGPU_RELATIVE64:
V = B + A;
size = 8;
break;
default:
fprintf (stderr, "Error: unsupported relocation type.\n");
exit (1);
}
XHSA (hsa_fns.hsa_memory_copy_fn ((void *) P, &V, size),
"Fix up relocation");
}
}
}
/* Allocate some device memory from the kernargs region.

View file

@ -83,7 +83,7 @@ int gcn_isa = 3; /* Default to GCN3. */
/* The number of registers usable by normal non-kernel functions.
The SGPR count includes any special extra registers such as VCC. */
#define MAX_NORMAL_SGPR_COUNT 64
#define MAX_NORMAL_SGPR_COUNT 62 // i.e. 64 with VCC
#define MAX_NORMAL_VGPR_COUNT 24
/* }}} */
@ -127,7 +127,7 @@ gcn_option_override (void)
if (!flag_pic)
flag_pic = flag_pie;
gcn_isa = gcn_arch == PROCESSOR_VEGA ? 5 : 3;
gcn_isa = gcn_arch == PROCESSOR_FIJI ? 3 : 5;
/* The default stack size needs to be small for offload kernels because
there may be many, many threads. Also, a smaller stack gives a
@ -168,37 +168,31 @@ static const struct gcn_kernel_arg_type
{"exec", NULL, DImode, EXEC_REG},
#define PRIVATE_SEGMENT_BUFFER_ARG 1
{"private_segment_buffer",
"enable_sgpr_private_segment_buffer", TImode, -1},
".amdhsa_user_sgpr_private_segment_buffer", TImode, -1},
#define DISPATCH_PTR_ARG 2
{"dispatch_ptr", "enable_sgpr_dispatch_ptr", DImode, -1},
{"dispatch_ptr", ".amdhsa_user_sgpr_dispatch_ptr", DImode, -1},
#define QUEUE_PTR_ARG 3
{"queue_ptr", "enable_sgpr_queue_ptr", DImode, -1},
{"queue_ptr", ".amdhsa_user_sgpr_queue_ptr", DImode, -1},
#define KERNARG_SEGMENT_PTR_ARG 4
{"kernarg_segment_ptr", "enable_sgpr_kernarg_segment_ptr", DImode, -1},
{"dispatch_id", "enable_sgpr_dispatch_id", DImode, -1},
{"kernarg_segment_ptr", ".amdhsa_user_sgpr_kernarg_segment_ptr", DImode, -1},
{"dispatch_id", ".amdhsa_user_sgpr_dispatch_id", DImode, -1},
#define FLAT_SCRATCH_INIT_ARG 6
{"flat_scratch_init", "enable_sgpr_flat_scratch_init", DImode, -1},
{"flat_scratch_init", ".amdhsa_user_sgpr_flat_scratch_init", DImode, -1},
#define FLAT_SCRATCH_SEGMENT_SIZE_ARG 7
{"private_segment_size", "enable_sgpr_private_segment_size", SImode, -1},
{"grid_workgroup_count_X",
"enable_sgpr_grid_workgroup_count_x", SImode, -1},
{"grid_workgroup_count_Y",
"enable_sgpr_grid_workgroup_count_y", SImode, -1},
{"grid_workgroup_count_Z",
"enable_sgpr_grid_workgroup_count_z", SImode, -1},
#define WORKGROUP_ID_X_ARG 11
{"workgroup_id_X", "enable_sgpr_workgroup_id_x", SImode, -2},
{"workgroup_id_Y", "enable_sgpr_workgroup_id_y", SImode, -2},
{"workgroup_id_Z", "enable_sgpr_workgroup_id_z", SImode, -2},
{"workgroup_info", "enable_sgpr_workgroup_info", SImode, -1},
#define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 15
{"private_segment_size", ".amdhsa_user_sgpr_private_segment_size", SImode, -1},
#define WORKGROUP_ID_X_ARG 8
{"workgroup_id_X", ".amdhsa_system_sgpr_workgroup_id_x", SImode, -2},
{"workgroup_id_Y", ".amdhsa_system_sgpr_workgroup_id_y", SImode, -2},
{"workgroup_id_Z", ".amdhsa_system_sgpr_workgroup_id_z", SImode, -2},
{"workgroup_info", ".amdhsa_system_sgpr_workgroup_info", SImode, -1},
#define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 12
{"private_segment_wave_offset",
"enable_sgpr_private_segment_wave_byte_offset", SImode, -2},
#define WORK_ITEM_ID_X_ARG 16
".amdhsa_system_sgpr_private_segment_wavefront_offset", SImode, -2},
#define WORK_ITEM_ID_X_ARG 13
{"work_item_id_X", NULL, V64SImode, FIRST_VGPR_REG},
#define WORK_ITEM_ID_Y_ARG 17
#define WORK_ITEM_ID_Y_ARG 14
{"work_item_id_Y", NULL, V64SImode, FIRST_VGPR_REG + 1},
#define WORK_ITEM_ID_Z_ARG 18
#define WORK_ITEM_ID_Z_ARG 15
{"work_item_id_Z", NULL, V64SImode, FIRST_VGPR_REG + 2}
};
@ -2075,7 +2069,7 @@ gcn_conditional_register_usage (void)
if (cfun->machine->normal_function)
{
/* Restrict the set of SGPRs and VGPRs used by non-kernel functions. */
for (int i = SGPR_REGNO (MAX_NORMAL_SGPR_COUNT - 2);
for (int i = SGPR_REGNO (MAX_NORMAL_SGPR_COUNT);
i <= LAST_SGPR_REG; i++)
fixed_regs[i] = 1, call_used_regs[i] = 1;
@ -2574,9 +2568,9 @@ gcn_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
if (strcmp (name, "fiji") == 0)
return gcn_arch == PROCESSOR_FIJI;
if (strcmp (name, "gfx900") == 0)
return gcn_arch == PROCESSOR_VEGA;
return gcn_arch == PROCESSOR_VEGA10;
if (strcmp (name, "gfx906") == 0)
return gcn_arch == PROCESSOR_VEGA;
return gcn_arch == PROCESSOR_VEGA20;
return 0;
default:
gcc_unreachable ();
@ -4943,11 +4937,16 @@ gcn_fixup_accel_lto_options (tree fndecl)
static void
output_file_start (void)
{
fprintf (asm_out_file, "\t.text\n");
fprintf (asm_out_file, "\t.hsa_code_object_version 2,0\n");
fprintf (asm_out_file, "\t.hsa_code_object_isa\n"); /* Autodetect. */
fprintf (asm_out_file, "\t.section\t.AMDGPU.config\n");
fprintf (asm_out_file, "\t.text\n");
char *cpu;
switch (gcn_arch)
{
case PROCESSOR_FIJI: cpu = "gfx803"; break;
case PROCESSOR_VEGA10: cpu = "gfx900"; break;
case PROCESSOR_VEGA20: cpu = "gfx906"; break;
default: gcc_unreachable ();
}
fprintf(asm_out_file, "\t.amdgcn_target \"amdgcn-unknown-amdhsa--%s\"\n", cpu);
}
/* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h.
@ -4963,7 +4962,8 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
{
int sgpr, vgpr;
bool xnack_enabled = false;
int extra_regs = 0;
fputs ("\n\n", file);
if (cfun && cfun->machine && cfun->machine->normal_function)
{
@ -4986,76 +4986,20 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
break;
vgpr++;
if (xnack_enabled)
extra_regs = 6;
if (df_regs_ever_live_p (FLAT_SCRATCH_LO_REG)
|| df_regs_ever_live_p (FLAT_SCRATCH_HI_REG))
extra_regs = 4;
else if (df_regs_ever_live_p (VCC_LO_REG)
|| df_regs_ever_live_p (VCC_HI_REG))
extra_regs = 2;
if (!leaf_function_p ())
{
/* We can't know how many registers function calls might use. */
if (vgpr < MAX_NORMAL_VGPR_COUNT)
vgpr = MAX_NORMAL_VGPR_COUNT;
if (sgpr + extra_regs < MAX_NORMAL_SGPR_COUNT)
sgpr = MAX_NORMAL_SGPR_COUNT - extra_regs;
if (sgpr < MAX_NORMAL_SGPR_COUNT)
sgpr = MAX_NORMAL_SGPR_COUNT;
}
/* GFX8 allocates SGPRs in blocks of 8.
GFX9 uses blocks of 16. */
int granulated_sgprs;
if (TARGET_GCN3)
granulated_sgprs = (sgpr + extra_regs + 7) / 8 - 1;
else if (TARGET_GCN5)
granulated_sgprs = 2 * ((sgpr + extra_regs + 15) / 16 - 1);
else
gcc_unreachable ();
fputs ("\t.align\t256\n", file);
fputs ("\t.type\t", file);
assemble_name (file, name);
fputs (",@function\n\t.amdgpu_hsa_kernel\t", file);
fputs ("\t.rodata\n"
"\t.p2align\t6\n"
"\t.amdhsa_kernel\t", file);
assemble_name (file, name);
fputs ("\n", file);
assemble_name (file, name);
fputs (":\n", file);
fprintf (file, "\t.amd_kernel_code_t\n"
"\t\tkernel_code_version_major = 1\n"
"\t\tkernel_code_version_minor = 0\n" "\t\tmachine_kind = 1\n"
/* "\t\tmachine_version_major = 8\n"
"\t\tmachine_version_minor = 0\n"
"\t\tmachine_version_stepping = 1\n" */
"\t\tkernel_code_entry_byte_offset = 256\n"
"\t\tkernel_code_prefetch_byte_size = 0\n"
"\t\tmax_scratch_backing_memory_byte_size = 0\n"
"\t\tcompute_pgm_rsrc1_vgprs = %i\n"
"\t\tcompute_pgm_rsrc1_sgprs = %i\n"
"\t\tcompute_pgm_rsrc1_priority = 0\n"
"\t\tcompute_pgm_rsrc1_float_mode = 192\n"
"\t\tcompute_pgm_rsrc1_priv = 0\n"
"\t\tcompute_pgm_rsrc1_dx10_clamp = 1\n"
"\t\tcompute_pgm_rsrc1_debug_mode = 0\n"
"\t\tcompute_pgm_rsrc1_ieee_mode = 1\n"
/* We enable scratch memory. */
"\t\tcompute_pgm_rsrc2_scratch_en = 1\n"
"\t\tcompute_pgm_rsrc2_user_sgpr = %i\n"
"\t\tcompute_pgm_rsrc2_tgid_x_en = 1\n"
"\t\tcompute_pgm_rsrc2_tgid_y_en = 0\n"
"\t\tcompute_pgm_rsrc2_tgid_z_en = 0\n"
"\t\tcompute_pgm_rsrc2_tg_size_en = 0\n"
"\t\tcompute_pgm_rsrc2_tidig_comp_cnt = 0\n"
"\t\tcompute_pgm_rsrc2_excp_en_msb = 0\n"
"\t\tcompute_pgm_rsrc2_lds_size = 0\n" /* Set at runtime. */
"\t\tcompute_pgm_rsrc2_excp_en = 0\n",
(vgpr - 1) / 4,
/* Must match wavefront_sgpr_count */
granulated_sgprs,
/* The total number of SGPR user data registers requested. This
number must match the number of user data registers enabled. */
cfun->machine->args.nsgprs);
int reg = FIRST_SGPR_REG;
for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
{
@ -5073,7 +5017,8 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
if (gcn_kernel_arg_types[a].header_pseudo)
{
fprintf (file, "\t\t%s = %i",
fprintf (file, "\t %s%s\t%i",
(cfun->machine->args.requested & (1 << a)) != 0 ? "" : ";",
gcn_kernel_arg_types[a].header_pseudo,
(cfun->machine->args.requested & (1 << a)) != 0);
if (reg_first != -1)
@ -5091,54 +5036,71 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
}
else if (gcn_kernel_arg_types[a].fixed_regno >= 0
&& cfun->machine->args.requested & (1 << a))
fprintf (file, "\t\t; %s = %i (%s)\n",
fprintf (file, "\t ; %s\t%i (%s)\n",
gcn_kernel_arg_types[a].name,
(cfun->machine->args.requested & (1 << a)) != 0,
reg_names[gcn_kernel_arg_types[a].fixed_regno]);
}
fprintf (file, "\t\tenable_vgpr_workitem_id = %i\n",
fprintf (file, "\t .amdhsa_system_vgpr_workitem_id\t%i\n",
(cfun->machine->args.requested & (1 << WORK_ITEM_ID_Z_ARG))
? 2
: cfun->machine->args.requested & (1 << WORK_ITEM_ID_Y_ARG)
? 1 : 0);
fprintf (file, "\t\tenable_ordered_append_gds = 0\n"
"\t\tprivate_element_size = 1\n"
"\t\tis_ptr64 = 1\n"
"\t\tis_dynamic_callstack = 0\n"
"\t\tis_debug_enabled = 0\n"
"\t\tis_xnack_enabled = %i\n"
"\t\tworkitem_private_segment_byte_size = %i\n"
"\t\tworkgroup_group_segment_byte_size = %u\n"
"\t\tgds_segment_byte_size = 0\n"
"\t\tkernarg_segment_byte_size = %i\n"
"\t\tworkgroup_fbarrier_count = 0\n"
"\t\twavefront_sgpr_count = %i\n"
"\t\tworkitem_vgpr_count = %i\n"
"\t\treserved_vgpr_first = 0\n"
"\t\treserved_vgpr_count = 0\n"
"\t\treserved_sgpr_first = 0\n"
"\t\treserved_sgpr_count = 0\n"
"\t\tdebug_wavefront_private_segment_offset_sgpr = 0\n"
"\t\tdebug_private_segment_buffer_sgpr = 0\n"
"\t\tkernarg_segment_alignment = %i\n"
"\t\tgroup_segment_alignment = 4\n"
"\t\tprivate_segment_alignment = %i\n"
"\t\twavefront_size = 6\n"
"\t\tcall_convention = 0\n"
"\t\truntime_loader_kernel_symbol = 0\n"
"\t.end_amd_kernel_code_t\n", xnack_enabled,
fprintf (file,
"\t .amdhsa_next_free_vgpr\t%i\n"
"\t .amdhsa_next_free_sgpr\t%i\n"
"\t .amdhsa_reserve_vcc\t1\n"
"\t .amdhsa_reserve_flat_scratch\t0\n"
"\t .amdhsa_reserve_xnack_mask\t%i\n"
"\t .amdhsa_private_segment_fixed_size\t%i\n"
"\t .amdhsa_group_segment_fixed_size\t%u\n"
"\t .amdhsa_float_denorm_mode_32\t3\n"
"\t .amdhsa_float_denorm_mode_16_64\t3\n",
vgpr,
sgpr,
xnack_enabled,
/* workitem_private_segment_bytes_size needs to be
one 64th the wave-front stack size. */
stack_size_opt / 64,
LDS_SIZE, cfun->machine->kernarg_segment_byte_size,
/* Number of scalar registers used by a wavefront. This
includes the special SGPRs for VCC, Flat Scratch (Base,
Size) and XNACK (for GFX8 (VI)+). It does not include the
16 SGPR added if a trap handler is enabled. Must match
compute_pgm_rsrc1.sgprs. */
sgpr + extra_regs, vgpr,
LDS_SIZE);
fputs ("\t.end_amdhsa_kernel\n", file);
#if 1
/* The following is YAML embedded in assembler; tabs are not allowed. */
fputs (" .amdgpu_metadata\n"
" amdhsa.version:\n"
" - 1\n"
" - 0\n"
" amdhsa.kernels:\n"
" - .name: ", file);
assemble_name (file, name);
fputs ("\n .symbol: ", file);
assemble_name (file, name);
fprintf (file,
".kd\n"
" .kernarg_segment_size: %i\n"
" .kernarg_segment_align: %i\n"
" .group_segment_fixed_size: %u\n"
" .private_segment_fixed_size: %i\n"
" .wavefront_size: 64\n"
" .sgpr_count: %i\n"
" .vgpr_count: %i\n"
" .max_flat_workgroup_size: 1024\n",
cfun->machine->kernarg_segment_byte_size,
cfun->machine->kernarg_segment_alignment,
crtl->stack_alignment_needed / 8);
LDS_SIZE,
stack_size_opt / 64,
sgpr, vgpr);
fputs (" .end_amdgpu_metadata\n", file);
#endif
fputs ("\t.text\n", file);
fputs ("\t.align\t256\n", file);
fputs ("\t.type\t", file);
assemble_name (file, name);
fputs (",@function\n", file);
assemble_name (file, name);
fputs (":\n", file);
/* This comment is read by mkoffload. */
if (flag_openacc)
@ -5200,11 +5162,6 @@ gcn_target_asm_function_prologue (FILE *file)
asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
asm_fprintf (file, "\t; outgoing args size: %wd\n",
offsets->outgoing_args_size);
/* Enable denorms. */
asm_fprintf (file, "\n\t; Set MODE[FP_DENORM]: allow single and double"
" input and output denorms\n");
asm_fprintf (file, "\ts_setreg_imm32_b32\thwreg(1, 4, 4), 0xf\n\n");
}
}

View file

@ -525,7 +525,7 @@ enum gcn_address_spaces
#ifndef USED_FOR_TARGET
#define GCN_KERNEL_ARG_TYPES 19
#define GCN_KERNEL_ARG_TYPES 16
struct GTY(()) gcn_kernel_args
{
long requested;

View file

@ -29,10 +29,10 @@ EnumValue
Enum(gpu_type) String(fiji) Value(PROCESSOR_FIJI)
EnumValue
Enum(gpu_type) String(gfx900) Value(PROCESSOR_VEGA)
Enum(gpu_type) String(gfx900) Value(PROCESSOR_VEGA10)
EnumValue
Enum(gpu_type) String(gfx906) Value(PROCESSOR_VEGA)
Enum(gpu_type) String(gfx906) Value(PROCESSOR_VEGA20)
march=
Target RejectNegative Joined ToLower Enum(gpu_type) Var(gcn_arch) Init(PROCESSOR_FIJI)

View file

@ -1074,7 +1074,7 @@ init_environment_variables (void)
hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
if (hsa_runtime_lib == NULL)
hsa_runtime_lib = HSA_RUNTIME_LIB "libhsa-runtime64.so";
hsa_runtime_lib = HSA_RUNTIME_LIB "libhsa-runtime64.so.1";
support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
@ -1137,40 +1137,6 @@ get_executable_symbol_name (hsa_executable_symbol_t symbol)
return res;
}
/* Helper function for find_executable_symbol. */
static hsa_status_t
find_executable_symbol_1 (hsa_executable_t executable,
hsa_executable_symbol_t symbol,
void *data)
{
hsa_executable_symbol_t *res = (hsa_executable_symbol_t *)data;
*res = symbol;
return HSA_STATUS_INFO_BREAK;
}
/* Find a global symbol in EXECUTABLE, save to *SYMBOL and return true. If not
found, return false. */
static bool
find_executable_symbol (hsa_executable_t executable,
hsa_executable_symbol_t *symbol)
{
hsa_status_t status;
status
= hsa_fns.hsa_executable_iterate_symbols_fn (executable,
find_executable_symbol_1,
symbol);
if (status != HSA_STATUS_INFO_BREAK)
{
hsa_error ("Could not find executable symbol", status);
return false;
}
return true;
}
/* Get the number of GPU Compute Units. */
static int
@ -2007,13 +1973,15 @@ init_kernel_properties (struct kernel_info *kernel)
hsa_status_t status;
struct agent_info *agent = kernel->agent;
hsa_executable_symbol_t kernel_symbol;
char *buf = alloca (strlen (kernel->name) + 4);
sprintf (buf, "%s.kd", kernel->name);
status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
kernel->name, agent->id,
buf, agent->id,
0, &kernel_symbol);
if (status != HSA_STATUS_SUCCESS)
{
hsa_warn ("Could not find symbol for kernel in the code object", status);
fprintf (stderr, "not found name: '%s'\n", kernel->name);
fprintf (stderr, "not found name: '%s'\n", buf);
dump_executable_symbols (agent->executable);
goto failure;
}
@ -2327,61 +2295,6 @@ init_basic_kernel_info (struct kernel_info *kernel,
return true;
}
/* Find the load_offset for MODULE, save to *LOAD_OFFSET, and return true. If
not found, return false. */
static bool
find_load_offset (Elf64_Addr *load_offset, struct agent_info *agent,
struct module_info *module, Elf64_Ehdr *image,
Elf64_Shdr *sections)
{
bool res = false;
hsa_status_t status;
hsa_executable_symbol_t symbol;
if (!find_executable_symbol (agent->executable, &symbol))
return false;
status = hsa_fns.hsa_executable_symbol_get_info_fn
(symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, load_offset);
if (status != HSA_STATUS_SUCCESS)
{
hsa_error ("Could not extract symbol address", status);
return false;
}
char *symbol_name = get_executable_symbol_name (symbol);
if (symbol_name == NULL)
return false;
/* Find the kernel function in ELF, and calculate actual load offset. */
for (int i = 0; i < image->e_shnum; i++)
if (sections[i].sh_type == SHT_SYMTAB)
{
Elf64_Shdr *strtab = &sections[sections[i].sh_link];
char *strings = (char *)image + strtab->sh_offset;
for (size_t offset = 0;
offset < sections[i].sh_size;
offset += sections[i].sh_entsize)
{
Elf64_Sym *sym = (Elf64_Sym*)((char*)image
+ sections[i].sh_offset
+ offset);
if (strcmp (symbol_name, strings + sym->st_name) == 0)
{
*load_offset -= sym->st_value;
res = true;
break;
}
}
}
free (symbol_name);
return res;
}
/* Check that the GCN ISA of the given image matches the ISA of the agent. */
static bool
@ -2421,7 +2334,6 @@ static bool
create_and_finalize_hsa_program (struct agent_info *agent)
{
hsa_status_t status;
int reloc_count = 0;
bool res = true;
if (pthread_mutex_lock (&agent->prog_mutex))
{
@ -2450,18 +2362,6 @@ create_and_finalize_hsa_program (struct agent_info *agent)
if (!isa_matches_agent (agent, image))
goto fail;
/* Hide relocations from the HSA runtime loader.
Keep a copy of the unmodified section headers to use later. */
Elf64_Shdr *image_sections = (Elf64_Shdr *)((char *)image
+ image->e_shoff);
for (int i = image->e_shnum - 1; i >= 0; i--)
{
if (image_sections[i].sh_type == SHT_RELA
|| image_sections[i].sh_type == SHT_REL)
/* Change section type to something harmless. */
image_sections[i].sh_type |= 0x80;
}
hsa_code_object_t co = { 0 };
status = hsa_fns.hsa_code_object_deserialize_fn
(module->image_desc->gcn_image->image,
@ -2517,131 +2417,6 @@ create_and_finalize_hsa_program (struct agent_info *agent)
goto fail;
}
if (agent->module)
{
struct module_info *module = agent->module;
Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
Elf64_Shdr *sections = (Elf64_Shdr *)((char *)image + image->e_shoff);
Elf64_Addr load_offset;
if (!find_load_offset (&load_offset, agent, module, image, sections))
goto fail;
/* Record the physical load address range.
We need this for data copies later. */
Elf64_Phdr *segments = (Elf64_Phdr *)((char*)image + image->e_phoff);
Elf64_Addr low = ~0, high = 0;
for (int i = 0; i < image->e_phnum; i++)
if (segments[i].p_memsz > 0)
{
if (segments[i].p_paddr < low)
low = segments[i].p_paddr;
if (segments[i].p_paddr > high)
high = segments[i].p_paddr + segments[i].p_memsz - 1;
}
module->phys_address_start = low + load_offset;
module->phys_address_end = high + load_offset;
// Find dynamic symbol table
Elf64_Shdr *dynsym = NULL;
for (int i = 0; i < image->e_shnum; i++)
if (sections[i].sh_type == SHT_DYNSYM)
{
dynsym = &sections[i];
break;
}
/* Fix up relocations. */
for (int i = 0; i < image->e_shnum; i++)
{
if (sections[i].sh_type == (SHT_RELA | 0x80))
for (size_t offset = 0;
offset < sections[i].sh_size;
offset += sections[i].sh_entsize)
{
Elf64_Rela *reloc = (Elf64_Rela*)((char*)image
+ sections[i].sh_offset
+ offset);
Elf64_Sym *sym =
(dynsym
? (Elf64_Sym*)((char*)image
+ dynsym->sh_offset
+ (dynsym->sh_entsize
* ELF64_R_SYM (reloc->r_info)))
: NULL);
int64_t S = (sym ? sym->st_value : 0);
int64_t P = reloc->r_offset + load_offset;
int64_t A = reloc->r_addend;
int64_t B = load_offset;
int64_t V, size;
switch (ELF64_R_TYPE (reloc->r_info))
{
case R_AMDGPU_ABS32_LO:
V = (S + A) & 0xFFFFFFFF;
size = 4;
break;
case R_AMDGPU_ABS32_HI:
V = (S + A) >> 32;
size = 4;
break;
case R_AMDGPU_ABS64:
V = S + A;
size = 8;
break;
case R_AMDGPU_REL32:
V = S + A - P;
size = 4;
break;
case R_AMDGPU_REL64:
/* FIXME
LLD seems to emit REL64 where the the assembler has
ABS64. This is clearly wrong because it's not what the
compiler is expecting. Let's assume, for now, that
it's a bug. In any case, GCN kernels are always self
contained and therefore relative relocations will have
been resolved already, so this should be a safe
workaround. */
V = S + A/* - P*/;
size = 8;
break;
case R_AMDGPU_ABS32:
V = S + A;
size = 4;
break;
/* TODO R_AMDGPU_GOTPCREL */
/* TODO R_AMDGPU_GOTPCREL32_LO */
/* TODO R_AMDGPU_GOTPCREL32_HI */
case R_AMDGPU_REL32_LO:
V = (S + A - P) & 0xFFFFFFFF;
size = 4;
break;
case R_AMDGPU_REL32_HI:
V = (S + A - P) >> 32;
size = 4;
break;
case R_AMDGPU_RELATIVE64:
V = B + A;
size = 8;
break;
default:
fprintf (stderr, "Error: unsupported relocation type.\n");
exit (1);
}
status = hsa_fns.hsa_memory_copy_fn ((void*)P, &V, size);
if (status != HSA_STATUS_SUCCESS)
{
hsa_error ("Failed to fix up relocation", status);
goto fail;
}
reloc_count++;
}
}
}
GCN_DEBUG ("Loaded GCN kernels to device %d (%d relocations)\n",
agent->device_id, reloc_count);
final:
agent->prog_finalized = true;