amdgcn: Switch to HSACO v3 binary format

This upgrades the compiler to emit HSA Code Object v3 binaries.  This means
changing the assembler directives and the linker command-line options.

The gcn-run and libgomp loaders need corresponding alterations.  The
relocations no longer need to be fixed up manually, and the kernel symbol
names have changed slightly.

This move makes the binaries compatible with the new rocgdb from ROCm 3.5.

2020-06-17  Andrew Stubbs  <ams@codesourcery.com>

	gcc/
	* config/gcn/gcn-hsa.h (TEXT_SECTION_ASM_OP): Use ".text".
	(BSS_SECTION_ASM_OP): Use ".bss".
	(ASM_SPEC): Remove "-mattr=-code-object-v3".
	(LINK_SPEC): Add "--export-dynamic".
	* config/gcn/gcn-opts.h (processor_type): Replace PROCESSOR_VEGA with
	PROCESSOR_VEGA10 and PROCESSOR_VEGA20.
	* config/gcn/gcn-run.c (HSA_RUNTIME_LIB): Use ".so.1" variant.
	(load_image): Remove obsolete relocation handling.
	Add ".kd" suffix to the symbol names.
	* config/gcn/gcn.c (MAX_NORMAL_SGPR_COUNT): Set to 62.
	(gcn_option_override): Update gcn_isa test.
	(gcn_kernel_arg_types): Update all the assembler directives.
	Remove the obsolete options.
	(gcn_conditional_register_usage): Update MAX_NORMAL_SGPR_COUNT usage.
	(gcn_omp_device_kind_arch_isa): Handle PROCESSOR_VEGA10 and
	PROCESSOR_VEGA20.
	(output_file_start): Rework assembler file header.
	(gcn_hsa_declare_function_name): Rework kernel metadata.
	* config/gcn/gcn.h (GCN_KERNEL_ARG_TYPES): Set to 16.
	* config/gcn/gcn.opt (PROCESSOR_VEGA): Remove enum.
	(PROCESSOR_VEGA10): New enum value.
	(PROCESSOR_VEGA20): New enum value.

	libgomp/
	* plugin/plugin-gcn.c (init_environment_variables): Use ".so.1"
	variant for HSA_RUNTIME_LIB name.
	(find_executable_symbol_1): Delete.
	(find_executable_symbol): Delete.
	(init_kernel_properties): Add ".kd" suffix to symbol names.
	(find_load_offset): Delete.
	(create_and_finalize_hsa_program): Remove relocation handling.
This commit is contained in:
Andrew Stubbs 2020-06-02 21:00:40 +01:00
parent 8ad4fc26dc
commit f062c3f115
7 changed files with 121 additions and 518 deletions

View file

@ -18,8 +18,8 @@
#error elf.h included before elfos.h
#endif
#define TEXT_SECTION_ASM_OP "\t.section\t.text"
#define BSS_SECTION_ASM_OP "\t.section\t.bss"
#define TEXT_SECTION_ASM_OP "\t.text"
#define BSS_SECTION_ASM_OP "\t.bss"
#define GLOBAL_ASM_OP "\t.globl\t"
#define DATA_SECTION_ASM_OP "\t.data\t"
#define SET_ASM_OP "\t.set\t"
@ -76,10 +76,10 @@ extern unsigned int gcn_local_sym_hash (const char *name);
#define GOMP_SELF_SPECS ""
/* Use LLVM assembler and linker options. */
#define ASM_SPEC "-triple=amdgcn--amdhsa -mattr=-code-object-v3 " \
#define ASM_SPEC "-triple=amdgcn--amdhsa " \
"%:last_arg(%{march=*:-mcpu=%*}) " \
"-filetype=obj"
#define LINK_SPEC "--pie"
#define LINK_SPEC "--pie --export-dynamic"
#define LIB_SPEC "-lc"
/* Provides a _start symbol to keep the linker happy. */

View file

@ -20,8 +20,9 @@
/* Which processor to generate code or schedule for. */
enum processor_type
{
PROCESSOR_FIJI,
PROCESSOR_VEGA
PROCESSOR_FIJI, // gfx803
PROCESSOR_VEGA10, // gfx900
PROCESSOR_VEGA20 // gfx906
};
/* Set in gcn_option_override. */

View file

@ -55,7 +55,7 @@
#include "hsa.h"
#ifndef HSA_RUNTIME_LIB
#define HSA_RUNTIME_LIB "libhsa-runtime64.so"
#define HSA_RUNTIME_LIB "libhsa-runtime64.so.1"
#endif
#ifndef VERSION_STRING
@ -429,20 +429,6 @@ load_image (const char *filename)
&executable),
"Initialize GCN executable");
/* Hide relocations from the HSA runtime loader.
Keep a copy of the unmodified section headers to use later. */
Elf64_Shdr *image_sections =
(Elf64_Shdr *) ((char *) image + image->e_shoff);
Elf64_Shdr *sections = malloc (sizeof (Elf64_Shdr) * image->e_shnum);
memcpy (sections, image_sections, sizeof (Elf64_Shdr) * image->e_shnum);
for (int i = image->e_shnum - 1; i >= 0; i--)
{
if (image_sections[i].sh_type == SHT_RELA
|| image_sections[i].sh_type == SHT_REL)
/* Change section type to something harmless. */
image_sections[i].sh_type = SHT_NOTE;
}
/* Add the HSACO to the executable. */
hsa_code_object_t co = { 0 };
XHSA (hsa_fns.hsa_code_object_deserialize_fn (image, image_size, NULL, &co),
@ -457,23 +443,27 @@ load_image (const char *filename)
/* Locate the "_init_array" function, and read the kernel's properties. */
hsa_executable_symbol_t symbol;
XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_init_array",
device, 0, &symbol),
XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL,
"_init_array.kd", device, 0,
&symbol),
"Find '_init_array' function");
XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &init_array_kernel),
(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
&init_array_kernel),
"Extract '_init_array' kernel object kernel object");
/* Locate the "_fini_array" function, and read the kernel's properties. */
XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "_fini_array",
device, 0, &symbol),
XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL,
"_fini_array.kd", device, 0,
&symbol),
"Find '_fini_array' function");
XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &fini_array_kernel),
(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
&fini_array_kernel),
"Extract '_fini_array' kernel object kernel object");
/* Locate the "main" function, and read the kernel's properties. */
XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main",
XHSA (hsa_fns.hsa_executable_get_symbol_fn (executable, NULL, "main.kd",
device, 0, &symbol),
"Find 'main' function");
XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
@ -491,126 +481,6 @@ load_image (const char *filename)
(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
&private_segment_size),
"Extract private segment size");
/* Find main function in ELF, and calculate actual load offset. */
Elf64_Addr load_offset;
XHSA (hsa_fns.hsa_executable_symbol_get_info_fn
(symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS,
&load_offset),
"Extract 'main' symbol address");
for (int i = 0; i < image->e_shnum; i++)
if (sections[i].sh_type == SHT_SYMTAB)
{
Elf64_Shdr *strtab = &sections[sections[i].sh_link];
char *strings = (char *) image + strtab->sh_offset;
for (size_t offset = 0;
offset < sections[i].sh_size;
offset += sections[i].sh_entsize)
{
Elf64_Sym *sym = (Elf64_Sym *) ((char *) image
+ sections[i].sh_offset + offset);
if (strcmp ("main", strings + sym->st_name) == 0)
{
load_offset -= sym->st_value;
goto found_main;
}
}
}
/* We only get here when main was not found.
This should never happen. */
fprintf (stderr, "Error: main function not found.\n");
abort ();
found_main:;
/* Find dynamic symbol table. */
Elf64_Shdr *dynsym = NULL;
for (int i = 0; i < image->e_shnum; i++)
if (sections[i].sh_type == SHT_DYNSYM)
{
dynsym = &sections[i];
break;
}
/* Fix up relocations. */
for (int i = 0; i < image->e_shnum; i++)
{
if (sections[i].sh_type == SHT_RELA)
for (size_t offset = 0;
offset < sections[i].sh_size;
offset += sections[i].sh_entsize)
{
Elf64_Rela *reloc = (Elf64_Rela *) ((char *) image
+ sections[i].sh_offset
+ offset);
Elf64_Sym *sym =
(dynsym
? (Elf64_Sym *) ((char *) image
+ dynsym->sh_offset
+ (dynsym->sh_entsize
* ELF64_R_SYM (reloc->r_info))) : NULL);
int64_t S = (sym ? sym->st_value : 0);
int64_t P = reloc->r_offset + load_offset;
int64_t A = reloc->r_addend;
int64_t B = load_offset;
int64_t V, size;
switch (ELF64_R_TYPE (reloc->r_info))
{
case R_AMDGPU_ABS32_LO:
V = (S + A) & 0xFFFFFFFF;
size = 4;
break;
case R_AMDGPU_ABS32_HI:
V = (S + A) >> 32;
size = 4;
break;
case R_AMDGPU_ABS64:
V = S + A;
size = 8;
break;
case R_AMDGPU_REL32:
V = S + A - P;
size = 4;
break;
case R_AMDGPU_REL64:
/* FIXME
LLD seems to emit REL64 where the assembler has ABS64.
This is clearly wrong because it's not what the compiler
is expecting. Let's assume, for now, that it's a bug.
In any case, GCN kernels are always self contained and
therefore relative relocations will have been resolved
already, so this should be a safe workaround. */
V = S + A /* - P */ ;
size = 8;
break;
case R_AMDGPU_ABS32:
V = S + A;
size = 4;
break;
/* TODO R_AMDGPU_GOTPCREL */
/* TODO R_AMDGPU_GOTPCREL32_LO */
/* TODO R_AMDGPU_GOTPCREL32_HI */
case R_AMDGPU_REL32_LO:
V = (S + A - P) & 0xFFFFFFFF;
size = 4;
break;
case R_AMDGPU_REL32_HI:
V = (S + A - P) >> 32;
size = 4;
break;
case R_AMDGPU_RELATIVE64:
V = B + A;
size = 8;
break;
default:
fprintf (stderr, "Error: unsupported relocation type.\n");
exit (1);
}
XHSA (hsa_fns.hsa_memory_copy_fn ((void *) P, &V, size),
"Fix up relocation");
}
}
}
/* Allocate some device memory from the kernargs region.

View file

@ -83,7 +83,7 @@ int gcn_isa = 3; /* Default to GCN3. */
/* The number of registers usable by normal non-kernel functions.
The SGPR count includes any special extra registers such as VCC. */
#define MAX_NORMAL_SGPR_COUNT 64
#define MAX_NORMAL_SGPR_COUNT 62 // i.e. 64 with VCC
#define MAX_NORMAL_VGPR_COUNT 24
/* }}} */
@ -127,7 +127,7 @@ gcn_option_override (void)
if (!flag_pic)
flag_pic = flag_pie;
gcn_isa = gcn_arch == PROCESSOR_VEGA ? 5 : 3;
gcn_isa = gcn_arch == PROCESSOR_FIJI ? 3 : 5;
/* The default stack size needs to be small for offload kernels because
there may be many, many threads. Also, a smaller stack gives a
@ -168,37 +168,31 @@ static const struct gcn_kernel_arg_type
{"exec", NULL, DImode, EXEC_REG},
#define PRIVATE_SEGMENT_BUFFER_ARG 1
{"private_segment_buffer",
"enable_sgpr_private_segment_buffer", TImode, -1},
".amdhsa_user_sgpr_private_segment_buffer", TImode, -1},
#define DISPATCH_PTR_ARG 2
{"dispatch_ptr", "enable_sgpr_dispatch_ptr", DImode, -1},
{"dispatch_ptr", ".amdhsa_user_sgpr_dispatch_ptr", DImode, -1},
#define QUEUE_PTR_ARG 3
{"queue_ptr", "enable_sgpr_queue_ptr", DImode, -1},
{"queue_ptr", ".amdhsa_user_sgpr_queue_ptr", DImode, -1},
#define KERNARG_SEGMENT_PTR_ARG 4
{"kernarg_segment_ptr", "enable_sgpr_kernarg_segment_ptr", DImode, -1},
{"dispatch_id", "enable_sgpr_dispatch_id", DImode, -1},
{"kernarg_segment_ptr", ".amdhsa_user_sgpr_kernarg_segment_ptr", DImode, -1},
{"dispatch_id", ".amdhsa_user_sgpr_dispatch_id", DImode, -1},
#define FLAT_SCRATCH_INIT_ARG 6
{"flat_scratch_init", "enable_sgpr_flat_scratch_init", DImode, -1},
{"flat_scratch_init", ".amdhsa_user_sgpr_flat_scratch_init", DImode, -1},
#define FLAT_SCRATCH_SEGMENT_SIZE_ARG 7
{"private_segment_size", "enable_sgpr_private_segment_size", SImode, -1},
{"grid_workgroup_count_X",
"enable_sgpr_grid_workgroup_count_x", SImode, -1},
{"grid_workgroup_count_Y",
"enable_sgpr_grid_workgroup_count_y", SImode, -1},
{"grid_workgroup_count_Z",
"enable_sgpr_grid_workgroup_count_z", SImode, -1},
#define WORKGROUP_ID_X_ARG 11
{"workgroup_id_X", "enable_sgpr_workgroup_id_x", SImode, -2},
{"workgroup_id_Y", "enable_sgpr_workgroup_id_y", SImode, -2},
{"workgroup_id_Z", "enable_sgpr_workgroup_id_z", SImode, -2},
{"workgroup_info", "enable_sgpr_workgroup_info", SImode, -1},
#define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 15
{"private_segment_size", ".amdhsa_user_sgpr_private_segment_size", SImode, -1},
#define WORKGROUP_ID_X_ARG 8
{"workgroup_id_X", ".amdhsa_system_sgpr_workgroup_id_x", SImode, -2},
{"workgroup_id_Y", ".amdhsa_system_sgpr_workgroup_id_y", SImode, -2},
{"workgroup_id_Z", ".amdhsa_system_sgpr_workgroup_id_z", SImode, -2},
{"workgroup_info", ".amdhsa_system_sgpr_workgroup_info", SImode, -1},
#define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 12
{"private_segment_wave_offset",
"enable_sgpr_private_segment_wave_byte_offset", SImode, -2},
#define WORK_ITEM_ID_X_ARG 16
".amdhsa_system_sgpr_private_segment_wavefront_offset", SImode, -2},
#define WORK_ITEM_ID_X_ARG 13
{"work_item_id_X", NULL, V64SImode, FIRST_VGPR_REG},
#define WORK_ITEM_ID_Y_ARG 17
#define WORK_ITEM_ID_Y_ARG 14
{"work_item_id_Y", NULL, V64SImode, FIRST_VGPR_REG + 1},
#define WORK_ITEM_ID_Z_ARG 18
#define WORK_ITEM_ID_Z_ARG 15
{"work_item_id_Z", NULL, V64SImode, FIRST_VGPR_REG + 2}
};
@ -2075,7 +2069,7 @@ gcn_conditional_register_usage (void)
if (cfun->machine->normal_function)
{
/* Restrict the set of SGPRs and VGPRs used by non-kernel functions. */
for (int i = SGPR_REGNO (MAX_NORMAL_SGPR_COUNT - 2);
for (int i = SGPR_REGNO (MAX_NORMAL_SGPR_COUNT);
i <= LAST_SGPR_REG; i++)
fixed_regs[i] = 1, call_used_regs[i] = 1;
@ -2574,9 +2568,9 @@ gcn_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
if (strcmp (name, "fiji") == 0)
return gcn_arch == PROCESSOR_FIJI;
if (strcmp (name, "gfx900") == 0)
return gcn_arch == PROCESSOR_VEGA;
return gcn_arch == PROCESSOR_VEGA10;
if (strcmp (name, "gfx906") == 0)
return gcn_arch == PROCESSOR_VEGA;
return gcn_arch == PROCESSOR_VEGA20;
return 0;
default:
gcc_unreachable ();
@ -4943,11 +4937,16 @@ gcn_fixup_accel_lto_options (tree fndecl)
static void
output_file_start (void)
{
fprintf (asm_out_file, "\t.text\n");
fprintf (asm_out_file, "\t.hsa_code_object_version 2,0\n");
fprintf (asm_out_file, "\t.hsa_code_object_isa\n"); /* Autodetect. */
fprintf (asm_out_file, "\t.section\t.AMDGPU.config\n");
fprintf (asm_out_file, "\t.text\n");
char *cpu;
switch (gcn_arch)
{
case PROCESSOR_FIJI: cpu = "gfx803"; break;
case PROCESSOR_VEGA10: cpu = "gfx900"; break;
case PROCESSOR_VEGA20: cpu = "gfx906"; break;
default: gcc_unreachable ();
}
fprintf(asm_out_file, "\t.amdgcn_target \"amdgcn-unknown-amdhsa--%s\"\n", cpu);
}
/* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h.
@ -4963,7 +4962,8 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
{
int sgpr, vgpr;
bool xnack_enabled = false;
int extra_regs = 0;
fputs ("\n\n", file);
if (cfun && cfun->machine && cfun->machine->normal_function)
{
@ -4986,76 +4986,20 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
break;
vgpr++;
if (xnack_enabled)
extra_regs = 6;
if (df_regs_ever_live_p (FLAT_SCRATCH_LO_REG)
|| df_regs_ever_live_p (FLAT_SCRATCH_HI_REG))
extra_regs = 4;
else if (df_regs_ever_live_p (VCC_LO_REG)
|| df_regs_ever_live_p (VCC_HI_REG))
extra_regs = 2;
if (!leaf_function_p ())
{
/* We can't know how many registers function calls might use. */
if (vgpr < MAX_NORMAL_VGPR_COUNT)
vgpr = MAX_NORMAL_VGPR_COUNT;
if (sgpr + extra_regs < MAX_NORMAL_SGPR_COUNT)
sgpr = MAX_NORMAL_SGPR_COUNT - extra_regs;
if (sgpr < MAX_NORMAL_SGPR_COUNT)
sgpr = MAX_NORMAL_SGPR_COUNT;
}
/* GFX8 allocates SGPRs in blocks of 8.
GFX9 uses blocks of 16. */
int granulated_sgprs;
if (TARGET_GCN3)
granulated_sgprs = (sgpr + extra_regs + 7) / 8 - 1;
else if (TARGET_GCN5)
granulated_sgprs = 2 * ((sgpr + extra_regs + 15) / 16 - 1);
else
gcc_unreachable ();
fputs ("\t.align\t256\n", file);
fputs ("\t.type\t", file);
assemble_name (file, name);
fputs (",@function\n\t.amdgpu_hsa_kernel\t", file);
fputs ("\t.rodata\n"
"\t.p2align\t6\n"
"\t.amdhsa_kernel\t", file);
assemble_name (file, name);
fputs ("\n", file);
assemble_name (file, name);
fputs (":\n", file);
fprintf (file, "\t.amd_kernel_code_t\n"
"\t\tkernel_code_version_major = 1\n"
"\t\tkernel_code_version_minor = 0\n" "\t\tmachine_kind = 1\n"
/* "\t\tmachine_version_major = 8\n"
"\t\tmachine_version_minor = 0\n"
"\t\tmachine_version_stepping = 1\n" */
"\t\tkernel_code_entry_byte_offset = 256\n"
"\t\tkernel_code_prefetch_byte_size = 0\n"
"\t\tmax_scratch_backing_memory_byte_size = 0\n"
"\t\tcompute_pgm_rsrc1_vgprs = %i\n"
"\t\tcompute_pgm_rsrc1_sgprs = %i\n"
"\t\tcompute_pgm_rsrc1_priority = 0\n"
"\t\tcompute_pgm_rsrc1_float_mode = 192\n"
"\t\tcompute_pgm_rsrc1_priv = 0\n"
"\t\tcompute_pgm_rsrc1_dx10_clamp = 1\n"
"\t\tcompute_pgm_rsrc1_debug_mode = 0\n"
"\t\tcompute_pgm_rsrc1_ieee_mode = 1\n"
/* We enable scratch memory. */
"\t\tcompute_pgm_rsrc2_scratch_en = 1\n"
"\t\tcompute_pgm_rsrc2_user_sgpr = %i\n"
"\t\tcompute_pgm_rsrc2_tgid_x_en = 1\n"
"\t\tcompute_pgm_rsrc2_tgid_y_en = 0\n"
"\t\tcompute_pgm_rsrc2_tgid_z_en = 0\n"
"\t\tcompute_pgm_rsrc2_tg_size_en = 0\n"
"\t\tcompute_pgm_rsrc2_tidig_comp_cnt = 0\n"
"\t\tcompute_pgm_rsrc2_excp_en_msb = 0\n"
"\t\tcompute_pgm_rsrc2_lds_size = 0\n" /* Set at runtime. */
"\t\tcompute_pgm_rsrc2_excp_en = 0\n",
(vgpr - 1) / 4,
/* Must match wavefront_sgpr_count */
granulated_sgprs,
/* The total number of SGPR user data registers requested. This
number must match the number of user data registers enabled. */
cfun->machine->args.nsgprs);
int reg = FIRST_SGPR_REG;
for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
{
@ -5073,7 +5017,8 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
if (gcn_kernel_arg_types[a].header_pseudo)
{
fprintf (file, "\t\t%s = %i",
fprintf (file, "\t %s%s\t%i",
(cfun->machine->args.requested & (1 << a)) != 0 ? "" : ";",
gcn_kernel_arg_types[a].header_pseudo,
(cfun->machine->args.requested & (1 << a)) != 0);
if (reg_first != -1)
@ -5091,54 +5036,71 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
}
else if (gcn_kernel_arg_types[a].fixed_regno >= 0
&& cfun->machine->args.requested & (1 << a))
fprintf (file, "\t\t; %s = %i (%s)\n",
fprintf (file, "\t ; %s\t%i (%s)\n",
gcn_kernel_arg_types[a].name,
(cfun->machine->args.requested & (1 << a)) != 0,
reg_names[gcn_kernel_arg_types[a].fixed_regno]);
}
fprintf (file, "\t\tenable_vgpr_workitem_id = %i\n",
fprintf (file, "\t .amdhsa_system_vgpr_workitem_id\t%i\n",
(cfun->machine->args.requested & (1 << WORK_ITEM_ID_Z_ARG))
? 2
: cfun->machine->args.requested & (1 << WORK_ITEM_ID_Y_ARG)
? 1 : 0);
fprintf (file, "\t\tenable_ordered_append_gds = 0\n"
"\t\tprivate_element_size = 1\n"
"\t\tis_ptr64 = 1\n"
"\t\tis_dynamic_callstack = 0\n"
"\t\tis_debug_enabled = 0\n"
"\t\tis_xnack_enabled = %i\n"
"\t\tworkitem_private_segment_byte_size = %i\n"
"\t\tworkgroup_group_segment_byte_size = %u\n"
"\t\tgds_segment_byte_size = 0\n"
"\t\tkernarg_segment_byte_size = %i\n"
"\t\tworkgroup_fbarrier_count = 0\n"
"\t\twavefront_sgpr_count = %i\n"
"\t\tworkitem_vgpr_count = %i\n"
"\t\treserved_vgpr_first = 0\n"
"\t\treserved_vgpr_count = 0\n"
"\t\treserved_sgpr_first = 0\n"
"\t\treserved_sgpr_count = 0\n"
"\t\tdebug_wavefront_private_segment_offset_sgpr = 0\n"
"\t\tdebug_private_segment_buffer_sgpr = 0\n"
"\t\tkernarg_segment_alignment = %i\n"
"\t\tgroup_segment_alignment = 4\n"
"\t\tprivate_segment_alignment = %i\n"
"\t\twavefront_size = 6\n"
"\t\tcall_convention = 0\n"
"\t\truntime_loader_kernel_symbol = 0\n"
"\t.end_amd_kernel_code_t\n", xnack_enabled,
fprintf (file,
"\t .amdhsa_next_free_vgpr\t%i\n"
"\t .amdhsa_next_free_sgpr\t%i\n"
"\t .amdhsa_reserve_vcc\t1\n"
"\t .amdhsa_reserve_flat_scratch\t0\n"
"\t .amdhsa_reserve_xnack_mask\t%i\n"
"\t .amdhsa_private_segment_fixed_size\t%i\n"
"\t .amdhsa_group_segment_fixed_size\t%u\n"
"\t .amdhsa_float_denorm_mode_32\t3\n"
"\t .amdhsa_float_denorm_mode_16_64\t3\n",
vgpr,
sgpr,
xnack_enabled,
/* workitem_private_segment_bytes_size needs to be
one 64th the wave-front stack size. */
stack_size_opt / 64,
LDS_SIZE, cfun->machine->kernarg_segment_byte_size,
/* Number of scalar registers used by a wavefront. This
includes the special SGPRs for VCC, Flat Scratch (Base,
Size) and XNACK (for GFX8 (VI)+). It does not include the
16 SGPR added if a trap handler is enabled. Must match
compute_pgm_rsrc1.sgprs. */
sgpr + extra_regs, vgpr,
LDS_SIZE);
fputs ("\t.end_amdhsa_kernel\n", file);
#if 1
/* The following is YAML embedded in assembler; tabs are not allowed. */
fputs (" .amdgpu_metadata\n"
" amdhsa.version:\n"
" - 1\n"
" - 0\n"
" amdhsa.kernels:\n"
" - .name: ", file);
assemble_name (file, name);
fputs ("\n .symbol: ", file);
assemble_name (file, name);
fprintf (file,
".kd\n"
" .kernarg_segment_size: %i\n"
" .kernarg_segment_align: %i\n"
" .group_segment_fixed_size: %u\n"
" .private_segment_fixed_size: %i\n"
" .wavefront_size: 64\n"
" .sgpr_count: %i\n"
" .vgpr_count: %i\n"
" .max_flat_workgroup_size: 1024\n",
cfun->machine->kernarg_segment_byte_size,
cfun->machine->kernarg_segment_alignment,
crtl->stack_alignment_needed / 8);
LDS_SIZE,
stack_size_opt / 64,
sgpr, vgpr);
fputs (" .end_amdgpu_metadata\n", file);
#endif
fputs ("\t.text\n", file);
fputs ("\t.align\t256\n", file);
fputs ("\t.type\t", file);
assemble_name (file, name);
fputs (",@function\n", file);
assemble_name (file, name);
fputs (":\n", file);
/* This comment is read by mkoffload. */
if (flag_openacc)
@ -5200,11 +5162,6 @@ gcn_target_asm_function_prologue (FILE *file)
asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars);
asm_fprintf (file, "\t; outgoing args size: %wd\n",
offsets->outgoing_args_size);
/* Enable denorms. */
asm_fprintf (file, "\n\t; Set MODE[FP_DENORM]: allow single and double"
" input and output denorms\n");
asm_fprintf (file, "\ts_setreg_imm32_b32\thwreg(1, 4, 4), 0xf\n\n");
}
}

View file

@ -525,7 +525,7 @@ enum gcn_address_spaces
#ifndef USED_FOR_TARGET
#define GCN_KERNEL_ARG_TYPES 19
#define GCN_KERNEL_ARG_TYPES 16
struct GTY(()) gcn_kernel_args
{
long requested;

View file

@ -29,10 +29,10 @@ EnumValue
Enum(gpu_type) String(fiji) Value(PROCESSOR_FIJI)
EnumValue
Enum(gpu_type) String(gfx900) Value(PROCESSOR_VEGA)
Enum(gpu_type) String(gfx900) Value(PROCESSOR_VEGA10)
EnumValue
Enum(gpu_type) String(gfx906) Value(PROCESSOR_VEGA)
Enum(gpu_type) String(gfx906) Value(PROCESSOR_VEGA20)
march=
Target RejectNegative Joined ToLower Enum(gpu_type) Var(gcn_arch) Init(PROCESSOR_FIJI)

View file

@ -1074,7 +1074,7 @@ init_environment_variables (void)
hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
if (hsa_runtime_lib == NULL)
hsa_runtime_lib = HSA_RUNTIME_LIB "libhsa-runtime64.so";
hsa_runtime_lib = HSA_RUNTIME_LIB "libhsa-runtime64.so.1";
support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
@ -1137,40 +1137,6 @@ get_executable_symbol_name (hsa_executable_symbol_t symbol)
return res;
}
/* Helper function for find_executable_symbol. */
static hsa_status_t
find_executable_symbol_1 (hsa_executable_t executable,
hsa_executable_symbol_t symbol,
void *data)
{
hsa_executable_symbol_t *res = (hsa_executable_symbol_t *)data;
*res = symbol;
return HSA_STATUS_INFO_BREAK;
}
/* Find a global symbol in EXECUTABLE, save to *SYMBOL and return true. If not
found, return false. */
static bool
find_executable_symbol (hsa_executable_t executable,
hsa_executable_symbol_t *symbol)
{
hsa_status_t status;
status
= hsa_fns.hsa_executable_iterate_symbols_fn (executable,
find_executable_symbol_1,
symbol);
if (status != HSA_STATUS_INFO_BREAK)
{
hsa_error ("Could not find executable symbol", status);
return false;
}
return true;
}
/* Get the number of GPU Compute Units. */
static int
@ -2007,13 +1973,15 @@ init_kernel_properties (struct kernel_info *kernel)
hsa_status_t status;
struct agent_info *agent = kernel->agent;
hsa_executable_symbol_t kernel_symbol;
char *buf = alloca (strlen (kernel->name) + 4);
sprintf (buf, "%s.kd", kernel->name);
status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
kernel->name, agent->id,
buf, agent->id,
0, &kernel_symbol);
if (status != HSA_STATUS_SUCCESS)
{
hsa_warn ("Could not find symbol for kernel in the code object", status);
fprintf (stderr, "not found name: '%s'\n", kernel->name);
fprintf (stderr, "not found name: '%s'\n", buf);
dump_executable_symbols (agent->executable);
goto failure;
}
@ -2327,61 +2295,6 @@ init_basic_kernel_info (struct kernel_info *kernel,
return true;
}
/* Find the load_offset for MODULE, save to *LOAD_OFFSET, and return true. If
not found, return false. */
static bool
find_load_offset (Elf64_Addr *load_offset, struct agent_info *agent,
struct module_info *module, Elf64_Ehdr *image,
Elf64_Shdr *sections)
{
bool res = false;
hsa_status_t status;
hsa_executable_symbol_t symbol;
if (!find_executable_symbol (agent->executable, &symbol))
return false;
status = hsa_fns.hsa_executable_symbol_get_info_fn
(symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, load_offset);
if (status != HSA_STATUS_SUCCESS)
{
hsa_error ("Could not extract symbol address", status);
return false;
}
char *symbol_name = get_executable_symbol_name (symbol);
if (symbol_name == NULL)
return false;
/* Find the kernel function in ELF, and calculate actual load offset. */
for (int i = 0; i < image->e_shnum; i++)
if (sections[i].sh_type == SHT_SYMTAB)
{
Elf64_Shdr *strtab = &sections[sections[i].sh_link];
char *strings = (char *)image + strtab->sh_offset;
for (size_t offset = 0;
offset < sections[i].sh_size;
offset += sections[i].sh_entsize)
{
Elf64_Sym *sym = (Elf64_Sym*)((char*)image
+ sections[i].sh_offset
+ offset);
if (strcmp (symbol_name, strings + sym->st_name) == 0)
{
*load_offset -= sym->st_value;
res = true;
break;
}
}
}
free (symbol_name);
return res;
}
/* Check that the GCN ISA of the given image matches the ISA of the agent. */
static bool
@ -2421,7 +2334,6 @@ static bool
create_and_finalize_hsa_program (struct agent_info *agent)
{
hsa_status_t status;
int reloc_count = 0;
bool res = true;
if (pthread_mutex_lock (&agent->prog_mutex))
{
@ -2450,18 +2362,6 @@ create_and_finalize_hsa_program (struct agent_info *agent)
if (!isa_matches_agent (agent, image))
goto fail;
/* Hide relocations from the HSA runtime loader.
Keep a copy of the unmodified section headers to use later. */
Elf64_Shdr *image_sections = (Elf64_Shdr *)((char *)image
+ image->e_shoff);
for (int i = image->e_shnum - 1; i >= 0; i--)
{
if (image_sections[i].sh_type == SHT_RELA
|| image_sections[i].sh_type == SHT_REL)
/* Change section type to something harmless. */
image_sections[i].sh_type |= 0x80;
}
hsa_code_object_t co = { 0 };
status = hsa_fns.hsa_code_object_deserialize_fn
(module->image_desc->gcn_image->image,
@ -2517,131 +2417,6 @@ create_and_finalize_hsa_program (struct agent_info *agent)
goto fail;
}
if (agent->module)
{
struct module_info *module = agent->module;
Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
Elf64_Shdr *sections = (Elf64_Shdr *)((char *)image + image->e_shoff);
Elf64_Addr load_offset;
if (!find_load_offset (&load_offset, agent, module, image, sections))
goto fail;
/* Record the physical load address range.
We need this for data copies later. */
Elf64_Phdr *segments = (Elf64_Phdr *)((char*)image + image->e_phoff);
Elf64_Addr low = ~0, high = 0;
for (int i = 0; i < image->e_phnum; i++)
if (segments[i].p_memsz > 0)
{
if (segments[i].p_paddr < low)
low = segments[i].p_paddr;
if (segments[i].p_paddr > high)
high = segments[i].p_paddr + segments[i].p_memsz - 1;
}
module->phys_address_start = low + load_offset;
module->phys_address_end = high + load_offset;
// Find dynamic symbol table
Elf64_Shdr *dynsym = NULL;
for (int i = 0; i < image->e_shnum; i++)
if (sections[i].sh_type == SHT_DYNSYM)
{
dynsym = &sections[i];
break;
}
/* Fix up relocations. */
for (int i = 0; i < image->e_shnum; i++)
{
if (sections[i].sh_type == (SHT_RELA | 0x80))
for (size_t offset = 0;
offset < sections[i].sh_size;
offset += sections[i].sh_entsize)
{
Elf64_Rela *reloc = (Elf64_Rela*)((char*)image
+ sections[i].sh_offset
+ offset);
Elf64_Sym *sym =
(dynsym
? (Elf64_Sym*)((char*)image
+ dynsym->sh_offset
+ (dynsym->sh_entsize
* ELF64_R_SYM (reloc->r_info)))
: NULL);
int64_t S = (sym ? sym->st_value : 0);
int64_t P = reloc->r_offset + load_offset;
int64_t A = reloc->r_addend;
int64_t B = load_offset;
int64_t V, size;
switch (ELF64_R_TYPE (reloc->r_info))
{
case R_AMDGPU_ABS32_LO:
V = (S + A) & 0xFFFFFFFF;
size = 4;
break;
case R_AMDGPU_ABS32_HI:
V = (S + A) >> 32;
size = 4;
break;
case R_AMDGPU_ABS64:
V = S + A;
size = 8;
break;
case R_AMDGPU_REL32:
V = S + A - P;
size = 4;
break;
case R_AMDGPU_REL64:
/* FIXME
LLD seems to emit REL64 where the the assembler has
ABS64. This is clearly wrong because it's not what the
compiler is expecting. Let's assume, for now, that
it's a bug. In any case, GCN kernels are always self
contained and therefore relative relocations will have
been resolved already, so this should be a safe
workaround. */
V = S + A/* - P*/;
size = 8;
break;
case R_AMDGPU_ABS32:
V = S + A;
size = 4;
break;
/* TODO R_AMDGPU_GOTPCREL */
/* TODO R_AMDGPU_GOTPCREL32_LO */
/* TODO R_AMDGPU_GOTPCREL32_HI */
case R_AMDGPU_REL32_LO:
V = (S + A - P) & 0xFFFFFFFF;
size = 4;
break;
case R_AMDGPU_REL32_HI:
V = (S + A - P) >> 32;
size = 4;
break;
case R_AMDGPU_RELATIVE64:
V = B + A;
size = 8;
break;
default:
fprintf (stderr, "Error: unsupported relocation type.\n");
exit (1);
}
status = hsa_fns.hsa_memory_copy_fn ((void*)P, &V, size);
if (status != HSA_STATUS_SUCCESS)
{
hsa_error ("Failed to fix up relocation", status);
goto fail;
}
reloc_count++;
}
}
}
GCN_DEBUG ("Loaded GCN kernels to device %d (%d relocations)\n",
agent->device_id, reloc_count);
final:
agent->prog_finalized = true;