amd/common: switch to 3-spaces style
author    Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Mon, 7 Sep 2020 07:58:36 +0000 (09:58 +0200)
committer Vivek Pandya <vivekvpandya@gmail.com>
Mon, 7 Sep 2020 15:55:16 +0000 (21:25 +0530)
Follow-up of !4319 using the same clang-format config.

Acked-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Acked-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5310>
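
The clang-format config referenced above is not part of this commit; as a rough illustration only, here is a minimal .clang-format sketch with values inferred from the formatting visible in the hunks below (3-space indent, no tabs, ~100-column lines, aligned macro values, right-aligned pointers). The actual file from !4319 may use different or additional options.

   # Hypothetical sketch inferred from this diff, not the config from !4319
   BasedOnStyle: LLVM
   IndentWidth: 3
   UseTab: Never
   ColumnLimit: 100
   AlignConsecutiveMacros: true
   PointerAlignment: Right
   # Functions get the brace on its own line, control statements keep it attached
   BreakBeforeBraces: Linux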

22 files changed:
src/amd/common/.editorconfig [deleted file]
src/amd/common/ac_binary.c
src/amd/common/ac_binary.h
src/amd/common/ac_debug.c
src/amd/common/ac_debug.h
src/amd/common/ac_exp_param.h
src/amd/common/ac_gpu_info.c
src/amd/common/ac_gpu_info.h
src/amd/common/ac_rtld.c
src/amd/common/ac_rtld.h
src/amd/common/ac_shader_args.c
src/amd/common/ac_shader_args.h
src/amd/common/ac_shader_util.c
src/amd/common/ac_shader_util.h
src/amd/common/ac_shadowed_regs.c
src/amd/common/ac_shadowed_regs.h
src/amd/common/ac_surface.c
src/amd/common/ac_surface.h
src/amd/common/amd_family.h
src/amd/common/amd_kernel_code_t.h
src/amd/common/gfx10_format_table.h
src/amd/common/sid.h

diff --git a/src/amd/common/.editorconfig b/src/amd/common/.editorconfig
deleted file mode 100644 (file)
index 21a3c7d..0000000
+++ /dev/null
@@ -1,3 +0,0 @@
-[*.{c,h}]
-indent_style = tab
-indent_size = tab
diff --git a/src/amd/common/ac_binary.c b/src/amd/common/ac_binary.c
index 4651c064abd5731f807f8af08ecf243226790da6..93068696c3272f5156c0525d813bdb96cd9d3695 100644 (file)
  * SOFTWARE.
  */
 
-#include "ac_gpu_info.h"
 #include "ac_binary.h"
 
+#include "ac_gpu_info.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 
 #include <gelf.h>
 #include <libelf.h>
-#include <stdio.h>
-
 #include <sid.h>
+#include <stdio.h>
 
-#define SPILLED_SGPRS                                     0x4
-#define SPILLED_VGPRS                                     0x8
+#define SPILLED_SGPRS 0x4
+#define SPILLED_VGPRS 0x8
 
 /* Parse configuration data in .AMDGPU.config section format. */
-void ac_parse_shader_binary_config(const char *data, size_t nbytes,
-                                  unsigned wave_size,
-                                  bool really_needs_scratch,
-                                  const struct radeon_info *info,
-                                  struct ac_shader_config *conf)
+void ac_parse_shader_binary_config(const char *data, size_t nbytes, unsigned wave_size,
+                                   bool really_needs_scratch, const struct radeon_info *info,
+                                   struct ac_shader_config *conf)
 {
-       uint32_t scratch_size = 0;
+   uint32_t scratch_size = 0;
 
-       for (size_t i = 0; i < nbytes; i += 8) {
-               unsigned reg = util_le32_to_cpu(*(uint32_t*)(data + i));
-               unsigned value = util_le32_to_cpu(*(uint32_t*)(data + i + 4));
-               switch (reg) {
-               case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
-               case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
-               case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
-               case R_00B848_COMPUTE_PGM_RSRC1:
-               case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
-                       if (wave_size == 32)
-                               conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 8);
-                       else
-                               conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
+   for (size_t i = 0; i < nbytes; i += 8) {
+      unsigned reg = util_le32_to_cpu(*(uint32_t *)(data + i));
+      unsigned value = util_le32_to_cpu(*(uint32_t *)(data + i + 4));
+      switch (reg) {
+      case R_00B028_SPI_SHADER_PGM_RSRC1_PS:
+      case R_00B128_SPI_SHADER_PGM_RSRC1_VS:
+      case R_00B228_SPI_SHADER_PGM_RSRC1_GS:
+      case R_00B848_COMPUTE_PGM_RSRC1:
+      case R_00B428_SPI_SHADER_PGM_RSRC1_HS:
+         if (wave_size == 32)
+            conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 8);
+         else
+            conf->num_vgprs = MAX2(conf->num_vgprs, (G_00B028_VGPRS(value) + 1) * 4);
 
-                       conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
-                       /* TODO: LLVM doesn't set FLOAT_MODE for non-compute shaders */
-                       conf->float_mode =  G_00B028_FLOAT_MODE(value);
-                       conf->rsrc1 = value;
-                       break;
-               case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
-                       conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
-                       /* TODO: LLVM doesn't set SHARED_VGPR_CNT for all shader types */
-                       conf->num_shared_vgprs = G_00B02C_SHARED_VGPR_CNT(value);
-                       conf->rsrc2 = value;
-                       break;
-               case R_00B12C_SPI_SHADER_PGM_RSRC2_VS:
-                       conf->num_shared_vgprs = G_00B12C_SHARED_VGPR_CNT(value);
-                       conf->rsrc2 = value;
-                       break;
-               case R_00B22C_SPI_SHADER_PGM_RSRC2_GS:
-                       conf->num_shared_vgprs = G_00B22C_SHARED_VGPR_CNT(value);
-                       conf->rsrc2 = value;
-                       break;
-               case R_00B42C_SPI_SHADER_PGM_RSRC2_HS:
-                       conf->num_shared_vgprs = G_00B42C_SHARED_VGPR_CNT(value);
-                       conf->rsrc2 = value;
-                       break;
-               case R_00B84C_COMPUTE_PGM_RSRC2:
-                       conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
-                       conf->rsrc2 = value;
-                       break;
-               case R_00B8A0_COMPUTE_PGM_RSRC3:
-                       conf->num_shared_vgprs = G_00B8A0_SHARED_VGPR_CNT(value);
-                       conf->rsrc3 = value;
-                       break;
-               case R_0286CC_SPI_PS_INPUT_ENA:
-                       conf->spi_ps_input_ena = value;
-                       break;
-               case R_0286D0_SPI_PS_INPUT_ADDR:
-                       conf->spi_ps_input_addr = value;
-                       break;
-               case R_0286E8_SPI_TMPRING_SIZE:
-               case R_00B860_COMPUTE_TMPRING_SIZE:
-                       /* WAVESIZE is in units of 256 dwords. */
-                       scratch_size = value;
-                       break;
-               case SPILLED_SGPRS:
-                       conf->spilled_sgprs = value;
-                       break;
-               case SPILLED_VGPRS:
-                       conf->spilled_vgprs = value;
-                       break;
-               default:
-                       {
-                               static bool printed;
+         conf->num_sgprs = MAX2(conf->num_sgprs, (G_00B028_SGPRS(value) + 1) * 8);
+         /* TODO: LLVM doesn't set FLOAT_MODE for non-compute shaders */
+         conf->float_mode = G_00B028_FLOAT_MODE(value);
+         conf->rsrc1 = value;
+         break;
+      case R_00B02C_SPI_SHADER_PGM_RSRC2_PS:
+         conf->lds_size = MAX2(conf->lds_size, G_00B02C_EXTRA_LDS_SIZE(value));
+         /* TODO: LLVM doesn't set SHARED_VGPR_CNT for all shader types */
+         conf->num_shared_vgprs = G_00B02C_SHARED_VGPR_CNT(value);
+         conf->rsrc2 = value;
+         break;
+      case R_00B12C_SPI_SHADER_PGM_RSRC2_VS:
+         conf->num_shared_vgprs = G_00B12C_SHARED_VGPR_CNT(value);
+         conf->rsrc2 = value;
+         break;
+      case R_00B22C_SPI_SHADER_PGM_RSRC2_GS:
+         conf->num_shared_vgprs = G_00B22C_SHARED_VGPR_CNT(value);
+         conf->rsrc2 = value;
+         break;
+      case R_00B42C_SPI_SHADER_PGM_RSRC2_HS:
+         conf->num_shared_vgprs = G_00B42C_SHARED_VGPR_CNT(value);
+         conf->rsrc2 = value;
+         break;
+      case R_00B84C_COMPUTE_PGM_RSRC2:
+         conf->lds_size = MAX2(conf->lds_size, G_00B84C_LDS_SIZE(value));
+         conf->rsrc2 = value;
+         break;
+      case R_00B8A0_COMPUTE_PGM_RSRC3:
+         conf->num_shared_vgprs = G_00B8A0_SHARED_VGPR_CNT(value);
+         conf->rsrc3 = value;
+         break;
+      case R_0286CC_SPI_PS_INPUT_ENA:
+         conf->spi_ps_input_ena = value;
+         break;
+      case R_0286D0_SPI_PS_INPUT_ADDR:
+         conf->spi_ps_input_addr = value;
+         break;
+      case R_0286E8_SPI_TMPRING_SIZE:
+      case R_00B860_COMPUTE_TMPRING_SIZE:
+         /* WAVESIZE is in units of 256 dwords. */
+         scratch_size = value;
+         break;
+      case SPILLED_SGPRS:
+         conf->spilled_sgprs = value;
+         break;
+      case SPILLED_VGPRS:
+         conf->spilled_vgprs = value;
+         break;
+      default: {
+         static bool printed;
 
-                               if (!printed) {
-                                       fprintf(stderr, "Warning: LLVM emitted unknown "
-                                               "config register: 0x%x\n", reg);
-                                       printed = true;
-                               }
-                       }
-                       break;
-               }
-       }
+         if (!printed) {
+            fprintf(stderr,
+                    "Warning: LLVM emitted unknown "
+                    "config register: 0x%x\n",
+                    reg);
+            printed = true;
+         }
+      } break;
+      }
+   }
 
-       if (!conf->spi_ps_input_addr)
-               conf->spi_ps_input_addr = conf->spi_ps_input_ena;
+   if (!conf->spi_ps_input_addr)
+      conf->spi_ps_input_addr = conf->spi_ps_input_ena;
 
-       if (really_needs_scratch) {
-               /* sgprs spills aren't spilling */
-               conf->scratch_bytes_per_wave = G_00B860_WAVESIZE(scratch_size) * 256 * 4;
-       }
+   if (really_needs_scratch) {
+      /* sgprs spills aren't spilling */
+      conf->scratch_bytes_per_wave = G_00B860_WAVESIZE(scratch_size) * 256 * 4;
+   }
 
-       /* GFX 10.3 internally:
-        * - aligns VGPRS to 16 for Wave32 and 8 for Wave64
-        * - aligns LDS to 1024
-        *
-        * For shader-db stats, set num_vgprs that the hw actually uses.
-        */
-       if (info->chip_class >= GFX10_3) {
-               conf->num_vgprs = align(conf->num_vgprs, wave_size == 32 ? 16 : 8);
-       }
+   /* GFX 10.3 internally:
+    * - aligns VGPRS to 16 for Wave32 and 8 for Wave64
+    * - aligns LDS to 1024
+    *
+    * For shader-db stats, set num_vgprs that the hw actually uses.
+    */
+   if (info->chip_class >= GFX10_3) {
+      conf->num_vgprs = align(conf->num_vgprs, wave_size == 32 ? 16 : 8);
+   }
 
-       /* Enable 64-bit and 16-bit denormals, because there is no performance
-        * cost.
-        *
-        * Don't enable denormals for 32-bit floats, because:
-        * - denormals disable output modifiers
-        * - denormals break v_mad_f32
-        * - GFX6 & GFX7 would be very slow
-        */
-       conf->float_mode &= ~V_00B028_FP_ALL_DENORMS;
-       conf->float_mode |= V_00B028_FP_64_DENORMS;
+   /* Enable 64-bit and 16-bit denormals, because there is no performance
+    * cost.
+    *
+    * Don't enable denormals for 32-bit floats, because:
+    * - denormals disable output modifiers
+    * - denormals break v_mad_f32
+    * - GFX6 & GFX7 would be very slow
+    */
+   conf->float_mode &= ~V_00B028_FP_ALL_DENORMS;
+   conf->float_mode |= V_00B028_FP_64_DENORMS;
 }
diff --git a/src/amd/common/ac_binary.h b/src/amd/common/ac_binary.h
index 0d981423696a061e37935e816d0e4f378e4cf94f..5eae2d50baa2300218a5e2fe7ed447909800d293 100644 (file)
@@ -24,9 +24,9 @@
 #ifndef AC_BINARY_H
 #define AC_BINARY_H
 
+#include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
-#include <stdbool.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -35,26 +35,24 @@ extern "C" {
 struct radeon_info;
 
 struct ac_shader_config {
-       unsigned num_sgprs;
-       unsigned num_vgprs;
-       unsigned num_shared_vgprs; /* GFX10: number of VGPRs shared between half-waves */
-       unsigned spilled_sgprs;
-       unsigned spilled_vgprs;
-       unsigned lds_size; /* in HW allocation units; i.e 256 bytes on SI, 512 bytes on CI+ */
-       unsigned spi_ps_input_ena;
-       unsigned spi_ps_input_addr;
-       unsigned float_mode;
-       unsigned scratch_bytes_per_wave;
-       unsigned rsrc1;
-       unsigned rsrc2;
-       unsigned rsrc3;
+   unsigned num_sgprs;
+   unsigned num_vgprs;
+   unsigned num_shared_vgprs; /* GFX10: number of VGPRs shared between half-waves */
+   unsigned spilled_sgprs;
+   unsigned spilled_vgprs;
+   unsigned lds_size; /* in HW allocation units; i.e 256 bytes on SI, 512 bytes on CI+ */
+   unsigned spi_ps_input_ena;
+   unsigned spi_ps_input_addr;
+   unsigned float_mode;
+   unsigned scratch_bytes_per_wave;
+   unsigned rsrc1;
+   unsigned rsrc2;
+   unsigned rsrc3;
 };
 
-void ac_parse_shader_binary_config(const char *data, size_t nbytes,
-                                  unsigned wave_size,
-                                  bool really_needs_scratch,
-                                  const struct radeon_info *info,
-                                  struct ac_shader_config *conf);
+void ac_parse_shader_binary_config(const char *data, size_t nbytes, unsigned wave_size,
+                                   bool really_needs_scratch, const struct radeon_info *info,
+                                   struct ac_shader_config *conf);
 
 #ifdef __cplusplus
 }
diff --git a/src/amd/common/ac_debug.c b/src/amd/common/ac_debug.c
index f095b0b45dabeb40fd43aab6695d56bc1926ce8b..bbaed82c4928610e1dee63749f78170f2ea3f9aa 100644 (file)
 #include "ac_debug.h"
 
 #ifdef HAVE_VALGRIND
-#include <valgrind.h>
 #include <memcheck.h>
+#include <valgrind.h>
 #define VG(x) x
 #else
 #define VG(x) ((void)0)
 #endif
 
-#include <inttypes.h>
-
 #include "sid.h"
 #include "sid_tables.h"
 #include "util/u_math.h"
 #include "util/u_string.h"
 
 #include <assert.h>
+#include <inttypes.h>
 
 /* Parsed IBs are difficult to read without colors. Use "less -R file" to
  * read them, or use "aha -b -f file" to convert them to html.
  */
-#define COLOR_RESET    "\033[0m"
-#define COLOR_RED      "\033[31m"
-#define COLOR_GREEN    "\033[1;32m"
-#define COLOR_YELLOW   "\033[1;33m"
-#define COLOR_CYAN     "\033[1;36m"
+#define COLOR_RESET  "\033[0m"
+#define COLOR_RED    "\033[31m"
+#define COLOR_GREEN  "\033[1;32m"
+#define COLOR_YELLOW "\033[1;33m"
+#define COLOR_CYAN   "\033[1;36m"
 
 #define INDENT_PKT 8
 
 struct ac_ib_parser {
-       FILE *f;
-       uint32_t *ib;
-       unsigned num_dw;
-       const int *trace_ids;
-       unsigned trace_id_count;
-       enum chip_class chip_class;
-       ac_debug_addr_callback addr_callback;
-       void *addr_callback_data;
-
-       unsigned cur_dw;
+   FILE *f;
+   uint32_t *ib;
+   unsigned num_dw;
+   const int *trace_ids;
+   unsigned trace_id_count;
+   enum chip_class chip_class;
+   ac_debug_addr_callback addr_callback;
+   void *addr_callback_data;
+
+   unsigned cur_dw;
 };
 
 static void ac_do_parse_ib(FILE *f, struct ac_ib_parser *ib);
 
 static void print_spaces(FILE *f, unsigned num)
 {
-       fprintf(f, "%*s", num, "");
+   fprintf(f, "%*s", num, "");
 }
 
 static void print_value(FILE *file, uint32_t value, int bits)
 {
-       /* Guess if it's int or float */
-       if (value <= (1 << 15)) {
-               if (value <= 9)
-                       fprintf(file, "%u\n", value);
-               else
-                       fprintf(file, "%u (0x%0*x)\n", value, bits / 4, value);
-       } else {
-               float f = uif(value);
-
-               if (fabs(f) < 100000 && f*10 == floor(f*10))
-                       fprintf(file, "%.1ff (0x%0*x)\n", f, bits / 4, value);
-               else
-                       /* Don't print more leading zeros than there are bits. */
-                       fprintf(file, "0x%0*x\n", bits / 4, value);
-       }
+   /* Guess if it's int or float */
+   if (value <= (1 << 15)) {
+      if (value <= 9)
+         fprintf(file, "%u\n", value);
+      else
+         fprintf(file, "%u (0x%0*x)\n", value, bits / 4, value);
+   } else {
+      float f = uif(value);
+
+      if (fabs(f) < 100000 && f * 10 == floor(f * 10))
+         fprintf(file, "%.1ff (0x%0*x)\n", f, bits / 4, value);
+      else
+         /* Don't print more leading zeros than there are bits. */
+         fprintf(file, "0x%0*x\n", bits / 4, value);
+   }
 }
 
-static void print_named_value(FILE *file, const char *name, uint32_t value,
-                             int bits)
+static void print_named_value(FILE *file, const char *name, uint32_t value, int bits)
 {
-       print_spaces(file, INDENT_PKT);
-       fprintf(file, COLOR_YELLOW "%s" COLOR_RESET " <- ", name);
-       print_value(file, value, bits);
+   print_spaces(file, INDENT_PKT);
+   fprintf(file, COLOR_YELLOW "%s" COLOR_RESET " <- ", name);
+   print_value(file, value, bits);
 }
 
 static const struct si_reg *find_register(enum chip_class chip_class, unsigned offset)
 {
-       const struct si_reg *table;
-       unsigned table_size;
-
-       switch (chip_class) {
-       case GFX10_3:
-       case GFX10:
-               table = gfx10_reg_table;
-               table_size = ARRAY_SIZE(gfx10_reg_table);
-               break;
-       case GFX9:
-               table = gfx9_reg_table;
-               table_size = ARRAY_SIZE(gfx9_reg_table);
-               break;
-       case GFX8:
-               table = gfx8_reg_table;
-               table_size = ARRAY_SIZE(gfx8_reg_table);
-               break;
-       case GFX7:
-               table = gfx7_reg_table;
-               table_size = ARRAY_SIZE(gfx7_reg_table);
-               break;
-       case GFX6:
-               table = gfx6_reg_table;
-               table_size = ARRAY_SIZE(gfx6_reg_table);
-               break;
-       default:
-               return NULL;
-       }
-
-       for (unsigned i = 0; i < table_size; i++) {
-               const struct si_reg *reg = &table[i];
-
-               if (reg->offset == offset)
-                       return reg;
-       }
-
-       return NULL;
+   const struct si_reg *table;
+   unsigned table_size;
+
+   switch (chip_class) {
+   case GFX10_3:
+   case GFX10:
+      table = gfx10_reg_table;
+      table_size = ARRAY_SIZE(gfx10_reg_table);
+      break;
+   case GFX9:
+      table = gfx9_reg_table;
+      table_size = ARRAY_SIZE(gfx9_reg_table);
+      break;
+   case GFX8:
+      table = gfx8_reg_table;
+      table_size = ARRAY_SIZE(gfx8_reg_table);
+      break;
+   case GFX7:
+      table = gfx7_reg_table;
+      table_size = ARRAY_SIZE(gfx7_reg_table);
+      break;
+   case GFX6:
+      table = gfx6_reg_table;
+      table_size = ARRAY_SIZE(gfx6_reg_table);
+      break;
+   default:
+      return NULL;
+   }
+
+   for (unsigned i = 0; i < table_size; i++) {
+      const struct si_reg *reg = &table[i];
+
+      if (reg->offset == offset)
+         return reg;
+   }
+
+   return NULL;
 }
 
 const char *ac_get_register_name(enum chip_class chip_class, unsigned offset)
 {
-       const struct si_reg *reg = find_register(chip_class, offset);
+   const struct si_reg *reg = find_register(chip_class, offset);
 
-       return reg ? sid_strings + reg->name_offset : "(no name)";
+   return reg ? sid_strings + reg->name_offset : "(no name)";
 }
 
-void ac_dump_reg(FILE *file, enum chip_class chip_class, unsigned offset,
-                uint32_t value, uint32_t field_mask)
+void ac_dump_reg(FILE *file, enum chip_class chip_class, unsigned offset, uint32_t value,
+                 uint32_t field_mask)
 {
-       const struct si_reg *reg = find_register(chip_class, offset);
-
-       if (reg) {
-               const char *reg_name = sid_strings + reg->name_offset;
-               bool first_field = true;
-
-               print_spaces(file, INDENT_PKT);
-               fprintf(file, COLOR_YELLOW "%s" COLOR_RESET " <- ",
-                       reg_name);
-
-               if (!reg->num_fields) {
-                       print_value(file, value, 32);
-                       return;
-               }
-
-               for (unsigned f = 0; f < reg->num_fields; f++) {
-                       const struct si_field *field = sid_fields_table + reg->fields_offset + f;
-                       const int *values_offsets = sid_strings_offsets + field->values_offset;
-                       uint32_t val = (value & field->mask) >>
-                                      (ffs(field->mask) - 1);
-
-                       if (!(field->mask & field_mask))
-                               continue;
-
-                       /* Indent the field. */
-                       if (!first_field)
-                               print_spaces(file,
-                                            INDENT_PKT + strlen(reg_name) + 4);
-
-                       /* Print the field. */
-                       fprintf(file, "%s = ", sid_strings + field->name_offset);
-
-                       if (val < field->num_values && values_offsets[val] >= 0)
-                               fprintf(file, "%s\n", sid_strings + values_offsets[val]);
-                       else
-                               print_value(file, val,
-                                           util_bitcount(field->mask));
-
-                       first_field = false;
-               }
-               return;
-       }
-
-       print_spaces(file, INDENT_PKT);
-       fprintf(file, COLOR_YELLOW "0x%05x" COLOR_RESET " <- 0x%08x\n", offset, value);
+   const struct si_reg *reg = find_register(chip_class, offset);
+
+   if (reg) {
+      const char *reg_name = sid_strings + reg->name_offset;
+      bool first_field = true;
+
+      print_spaces(file, INDENT_PKT);
+      fprintf(file, COLOR_YELLOW "%s" COLOR_RESET " <- ", reg_name);
+
+      if (!reg->num_fields) {
+         print_value(file, value, 32);
+         return;
+      }
+
+      for (unsigned f = 0; f < reg->num_fields; f++) {
+         const struct si_field *field = sid_fields_table + reg->fields_offset + f;
+         const int *values_offsets = sid_strings_offsets + field->values_offset;
+         uint32_t val = (value & field->mask) >> (ffs(field->mask) - 1);
+
+         if (!(field->mask & field_mask))
+            continue;
+
+         /* Indent the field. */
+         if (!first_field)
+            print_spaces(file, INDENT_PKT + strlen(reg_name) + 4);
+
+         /* Print the field. */
+         fprintf(file, "%s = ", sid_strings + field->name_offset);
+
+         if (val < field->num_values && values_offsets[val] >= 0)
+            fprintf(file, "%s\n", sid_strings + values_offsets[val]);
+         else
+            print_value(file, val, util_bitcount(field->mask));
+
+         first_field = false;
+      }
+      return;
+   }
+
+   print_spaces(file, INDENT_PKT);
+   fprintf(file, COLOR_YELLOW "0x%05x" COLOR_RESET " <- 0x%08x\n", offset, value);
 }
 
 static uint32_t ac_ib_get(struct ac_ib_parser *ib)
 {
-       uint32_t v = 0;
+   uint32_t v = 0;
 
-       if (ib->cur_dw < ib->num_dw) {
-               v = ib->ib[ib->cur_dw];
+   if (ib->cur_dw < ib->num_dw) {
+      v = ib->ib[ib->cur_dw];
 #ifdef HAVE_VALGRIND
-               /* Help figure out where garbage data is written to IBs.
-                *
-                * Arguably we should do this already when the IBs are written,
-                * see RADEON_VALGRIND. The problem is that client-requests to
-                * Valgrind have an overhead even when Valgrind isn't running,
-                * and radeon_emit is performance sensitive...
-                */
-               if (VALGRIND_CHECK_VALUE_IS_DEFINED(v))
-                       fprintf(ib->f, COLOR_RED "Valgrind: The next DWORD is garbage"
-                               COLOR_RESET "\n");
+      /* Help figure out where garbage data is written to IBs.
+       *
+       * Arguably we should do this already when the IBs are written,
+       * see RADEON_VALGRIND. The problem is that client-requests to
+       * Valgrind have an overhead even when Valgrind isn't running,
+       * and radeon_emit is performance sensitive...
+       */
+      if (VALGRIND_CHECK_VALUE_IS_DEFINED(v))
+         fprintf(ib->f, COLOR_RED "Valgrind: The next DWORD is garbage" COLOR_RESET "\n");
 #endif
-               fprintf(ib->f, "\n\035#%08x ", v);
-       } else {
-               fprintf(ib->f, "\n\035#???????? ");
-       }
+      fprintf(ib->f, "\n\035#%08x ", v);
+   } else {
+      fprintf(ib->f, "\n\035#???????? ");
+   }
 
-       ib->cur_dw++;
-       return v;
+   ib->cur_dw++;
+   return v;
 }
 
 static void ac_parse_set_reg_packet(FILE *f, unsigned count, unsigned reg_offset,
-                                   struct ac_ib_parser *ib)
+                                    struct ac_ib_parser *ib)
 {
-       unsigned reg_dw = ac_ib_get(ib);
-       unsigned reg = ((reg_dw & 0xFFFF) << 2) + reg_offset;
-       unsigned index = reg_dw >> 28;
-       int i;
-
-       if (index != 0) {
-               print_spaces(f, INDENT_PKT);
-               fprintf(f, "INDEX = %u\n", index);
-       }
-
-       for (i = 0; i < count; i++)
-               ac_dump_reg(f, ib->chip_class, reg + i*4, ac_ib_get(ib), ~0);
+   unsigned reg_dw = ac_ib_get(ib);
+   unsigned reg = ((reg_dw & 0xFFFF) << 2) + reg_offset;
+   unsigned index = reg_dw >> 28;
+   int i;
+
+   if (index != 0) {
+      print_spaces(f, INDENT_PKT);
+      fprintf(f, "INDEX = %u\n", index);
+   }
+
+   for (i = 0; i < count; i++)
+      ac_dump_reg(f, ib->chip_class, reg + i * 4, ac_ib_get(ib), ~0);
 }
 
 static void ac_parse_packet3(FILE *f, uint32_t header, struct ac_ib_parser *ib,
                              int *current_trace_id)
 {
-       unsigned first_dw = ib->cur_dw;
-       int count = PKT_COUNT_G(header);
-       unsigned op = PKT3_IT_OPCODE_G(header);
-       const char *predicate = PKT3_PREDICATE(header) ? "(predicate)" : "";
-       int i;
-
-       /* Print the name first. */
-       for (i = 0; i < ARRAY_SIZE(packet3_table); i++)
-               if (packet3_table[i].op == op)
-                       break;
-
-       if (i < ARRAY_SIZE(packet3_table)) {
-               const char *name = sid_strings + packet3_table[i].name_offset;
-
-               if (op == PKT3_SET_CONTEXT_REG ||
-                   op == PKT3_SET_CONFIG_REG ||
-                   op == PKT3_SET_UCONFIG_REG ||
-                   op == PKT3_SET_UCONFIG_REG_INDEX ||
-                   op == PKT3_SET_SH_REG)
-                       fprintf(f, COLOR_CYAN "%s%s" COLOR_CYAN ":\n",
-                               name, predicate);
-               else
-                       fprintf(f, COLOR_GREEN "%s%s" COLOR_RESET ":\n",
-                               name, predicate);
-       } else
-               fprintf(f, COLOR_RED "PKT3_UNKNOWN 0x%x%s" COLOR_RESET ":\n",
-                       op, predicate);
-
-       /* Print the contents. */
-       switch (op) {
-       case PKT3_SET_CONTEXT_REG:
-               ac_parse_set_reg_packet(f, count, SI_CONTEXT_REG_OFFSET, ib);
-               break;
-       case PKT3_SET_CONFIG_REG:
-               ac_parse_set_reg_packet(f, count, SI_CONFIG_REG_OFFSET, ib);
-               break;
-       case PKT3_SET_UCONFIG_REG:
-       case PKT3_SET_UCONFIG_REG_INDEX:
-               ac_parse_set_reg_packet(f, count, CIK_UCONFIG_REG_OFFSET, ib);
-               break;
-       case PKT3_SET_SH_REG:
-               ac_parse_set_reg_packet(f, count, SI_SH_REG_OFFSET, ib);
-               break;
-       case PKT3_ACQUIRE_MEM:
-               ac_dump_reg(f, ib->chip_class, R_0301F0_CP_COHER_CNTL, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_0301F4_CP_COHER_SIZE, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_030230_CP_COHER_SIZE_HI, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_0301F8_CP_COHER_BASE, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_0301E4_CP_COHER_BASE_HI, ac_ib_get(ib), ~0);
-               print_named_value(f, "POLL_INTERVAL", ac_ib_get(ib), 16);
-               if (ib->chip_class >= GFX10)
-                       ac_dump_reg(f, ib->chip_class, R_586_GCR_CNTL, ac_ib_get(ib), ~0);
-               break;
-       case PKT3_SURFACE_SYNC:
-               if (ib->chip_class >= GFX7) {
-                       ac_dump_reg(f, ib->chip_class, R_0301F0_CP_COHER_CNTL, ac_ib_get(ib), ~0);
-                       ac_dump_reg(f, ib->chip_class, R_0301F4_CP_COHER_SIZE, ac_ib_get(ib), ~0);
-                       ac_dump_reg(f, ib->chip_class, R_0301F8_CP_COHER_BASE, ac_ib_get(ib), ~0);
-               } else {
-                       ac_dump_reg(f, ib->chip_class, R_0085F0_CP_COHER_CNTL, ac_ib_get(ib), ~0);
-                       ac_dump_reg(f, ib->chip_class, R_0085F4_CP_COHER_SIZE, ac_ib_get(ib), ~0);
-                       ac_dump_reg(f, ib->chip_class, R_0085F8_CP_COHER_BASE, ac_ib_get(ib), ~0);
-               }
-               print_named_value(f, "POLL_INTERVAL", ac_ib_get(ib), 16);
-               break;
-       case PKT3_EVENT_WRITE: {
-               uint32_t event_dw = ac_ib_get(ib);
-               ac_dump_reg(f, ib->chip_class, R_028A90_VGT_EVENT_INITIATOR, event_dw,
-                           S_028A90_EVENT_TYPE(~0));
-               print_named_value(f, "EVENT_INDEX", (event_dw >> 8) & 0xf, 4);
-               print_named_value(f, "INV_L2", (event_dw >> 20) & 0x1, 1);
-               if (count > 0) {
-                       print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32);
-                       print_named_value(f, "ADDRESS_HI", ac_ib_get(ib), 16);
-               }
-               break;
-       }
-       case PKT3_EVENT_WRITE_EOP: {
-               uint32_t event_dw = ac_ib_get(ib);
-               ac_dump_reg(f, ib->chip_class, R_028A90_VGT_EVENT_INITIATOR, event_dw,
-                           S_028A90_EVENT_TYPE(~0));
-               print_named_value(f, "EVENT_INDEX", (event_dw >> 8) & 0xf, 4);
-               print_named_value(f, "TCL1_VOL_ACTION_ENA", (event_dw >> 12) & 0x1, 1);
-               print_named_value(f, "TC_VOL_ACTION_ENA", (event_dw >> 13) & 0x1, 1);
-               print_named_value(f, "TC_WB_ACTION_ENA", (event_dw >> 15) & 0x1, 1);
-               print_named_value(f, "TCL1_ACTION_ENA", (event_dw >> 16) & 0x1, 1);
-               print_named_value(f, "TC_ACTION_ENA", (event_dw >> 17) & 0x1, 1);
-               print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32);
-               uint32_t addr_hi_dw = ac_ib_get(ib);
-               print_named_value(f, "ADDRESS_HI", addr_hi_dw, 16);
-               print_named_value(f, "DST_SEL", (addr_hi_dw >> 16) & 0x3, 2);
-               print_named_value(f, "INT_SEL", (addr_hi_dw >> 24) & 0x7, 3);
-               print_named_value(f, "DATA_SEL", addr_hi_dw >> 29, 3);
-               print_named_value(f, "DATA_LO", ac_ib_get(ib), 32);
-               print_named_value(f, "DATA_HI", ac_ib_get(ib), 32);
-               break;
-       }
-       case PKT3_RELEASE_MEM: {
-               uint32_t event_dw = ac_ib_get(ib);
-               if (ib->chip_class >= GFX10) {
-                       ac_dump_reg(f, ib->chip_class, R_490_RELEASE_MEM_OP, event_dw, ~0u);
-               } else {
-                       ac_dump_reg(f, ib->chip_class, R_028A90_VGT_EVENT_INITIATOR, event_dw,
-                                   S_028A90_EVENT_TYPE(~0));
-                       print_named_value(f, "EVENT_INDEX", (event_dw >> 8) & 0xf, 4);
-                       print_named_value(f, "TCL1_VOL_ACTION_ENA", (event_dw >> 12) & 0x1, 1);
-                       print_named_value(f, "TC_VOL_ACTION_ENA", (event_dw >> 13) & 0x1, 1);
-                       print_named_value(f, "TC_WB_ACTION_ENA", (event_dw >> 15) & 0x1, 1);
-                       print_named_value(f, "TCL1_ACTION_ENA", (event_dw >> 16) & 0x1, 1);
-                       print_named_value(f, "TC_ACTION_ENA", (event_dw >> 17) & 0x1, 1);
-                       print_named_value(f, "TC_NC_ACTION_ENA", (event_dw >> 19) & 0x1, 1);
-                       print_named_value(f, "TC_WC_ACTION_ENA", (event_dw >> 20) & 0x1, 1);
-                       print_named_value(f, "TC_MD_ACTION_ENA", (event_dw >> 21) & 0x1, 1);
-               }
-               uint32_t sel_dw = ac_ib_get(ib);
-               print_named_value(f, "DST_SEL", (sel_dw >> 16) & 0x3, 2);
-               print_named_value(f, "INT_SEL", (sel_dw >> 24) & 0x7, 3);
-               print_named_value(f, "DATA_SEL", sel_dw >> 29, 3);
-               print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32);
-               print_named_value(f, "ADDRESS_HI", ac_ib_get(ib), 32);
-               print_named_value(f, "DATA_LO", ac_ib_get(ib), 32);
-               print_named_value(f, "DATA_HI", ac_ib_get(ib), 32);
-               print_named_value(f, "CTXID", ac_ib_get(ib), 32);
-               break;
-       }
-       case PKT3_WAIT_REG_MEM:
-               print_named_value(f, "OP", ac_ib_get(ib), 32);
-               print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32);
-               print_named_value(f, "ADDRESS_HI", ac_ib_get(ib), 32);
-               print_named_value(f, "REF", ac_ib_get(ib), 32);
-               print_named_value(f, "MASK", ac_ib_get(ib), 32);
-               print_named_value(f, "POLL_INTERVAL", ac_ib_get(ib), 16);
-               break;
-       case PKT3_DRAW_INDEX_AUTO:
-               ac_dump_reg(f, ib->chip_class, R_030930_VGT_NUM_INDICES, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_0287F0_VGT_DRAW_INITIATOR, ac_ib_get(ib), ~0);
-               break;
-       case PKT3_DRAW_INDEX_2:
-               ac_dump_reg(f, ib->chip_class, R_028A78_VGT_DMA_MAX_SIZE, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_0287E8_VGT_DMA_BASE, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_0287E4_VGT_DMA_BASE_HI, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_030930_VGT_NUM_INDICES, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_0287F0_VGT_DRAW_INITIATOR, ac_ib_get(ib), ~0);
-               break;
-       case PKT3_INDEX_TYPE:
-               ac_dump_reg(f, ib->chip_class, R_028A7C_VGT_DMA_INDEX_TYPE, ac_ib_get(ib), ~0);
-               break;
-       case PKT3_NUM_INSTANCES:
-               ac_dump_reg(f, ib->chip_class, R_030934_VGT_NUM_INSTANCES, ac_ib_get(ib), ~0);
-               break;
-       case PKT3_WRITE_DATA:
-               ac_dump_reg(f, ib->chip_class, R_370_CONTROL, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_371_DST_ADDR_LO, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_372_DST_ADDR_HI, ac_ib_get(ib), ~0);
-               /* The payload is written automatically */
-               break;
-       case PKT3_CP_DMA:
-               ac_dump_reg(f, ib->chip_class, R_410_CP_DMA_WORD0, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_411_CP_DMA_WORD1, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_412_CP_DMA_WORD2, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_413_CP_DMA_WORD3, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_414_COMMAND, ac_ib_get(ib), ~0);
-               break;
-       case PKT3_DMA_DATA:
-               ac_dump_reg(f, ib->chip_class, R_500_DMA_DATA_WORD0, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_501_SRC_ADDR_LO, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_502_SRC_ADDR_HI, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_503_DST_ADDR_LO, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_504_DST_ADDR_HI, ac_ib_get(ib), ~0);
-               ac_dump_reg(f, ib->chip_class, R_414_COMMAND, ac_ib_get(ib), ~0);
-               break;
-       case PKT3_INDIRECT_BUFFER_SI:
-       case PKT3_INDIRECT_BUFFER_CONST:
-       case PKT3_INDIRECT_BUFFER_CIK: {
-               uint32_t base_lo_dw = ac_ib_get(ib);
-               ac_dump_reg(f, ib->chip_class, R_3F0_IB_BASE_LO, base_lo_dw, ~0);
-               uint32_t base_hi_dw = ac_ib_get(ib);
-               ac_dump_reg(f, ib->chip_class, R_3F1_IB_BASE_HI, base_hi_dw, ~0);
-               uint32_t control_dw = ac_ib_get(ib);
-               ac_dump_reg(f, ib->chip_class, R_3F2_IB_CONTROL, control_dw, ~0);
-
-               if (!ib->addr_callback)
-                       break;
-
-               uint64_t addr = ((uint64_t)base_hi_dw << 32) | base_lo_dw;
-               void *data = ib->addr_callback(ib->addr_callback_data, addr);
-               if (!data)
-                       break;
-
-               if (G_3F2_CHAIN(control_dw)) {
-                       ib->ib = data;
-                       ib->num_dw = G_3F2_IB_SIZE(control_dw);
-                       ib->cur_dw = 0;
-                       return;
-               }
-
-               struct ac_ib_parser ib_recurse;
-               memcpy(&ib_recurse, ib, sizeof(ib_recurse));
-               ib_recurse.ib = data;
-               ib_recurse.num_dw = G_3F2_IB_SIZE(control_dw);
-               ib_recurse.cur_dw = 0;
-               if(ib_recurse.trace_id_count) {
-                       if (*current_trace_id == *ib->trace_ids) {
-                               ++ib_recurse.trace_ids;
-                               --ib_recurse.trace_id_count;
-                       } else {
-                               ib_recurse.trace_id_count = 0;
-                       }
-               }
-
-               fprintf(f, "\n\035>------------------ nested begin ------------------\n");
-               ac_do_parse_ib(f, &ib_recurse);
-               fprintf(f, "\n\035<------------------- nested end -------------------\n");
-               break;
-       }
-       case PKT3_CLEAR_STATE:
-       case PKT3_INCREMENT_DE_COUNTER:
-       case PKT3_PFP_SYNC_ME:
-               break;
-       case PKT3_NOP:
-               if (header == PKT3_NOP_PAD) {
-                       count = -1; /* One dword NOP. */
-               } else if (count == 0 && ib->cur_dw < ib->num_dw &&
-                          AC_IS_TRACE_POINT(ib->ib[ib->cur_dw])) {
-                       unsigned packet_id = AC_GET_TRACE_POINT_ID(ib->ib[ib->cur_dw]);
-
-                       print_spaces(f, INDENT_PKT);
-                       fprintf(f, COLOR_RED "Trace point ID: %u\n", packet_id);
-
-                       if (!ib->trace_id_count)
-                               break; /* tracing was disabled */
-
-                       *current_trace_id = packet_id;
-
-                       print_spaces(f, INDENT_PKT);
-                       if (packet_id < *ib->trace_ids)
-                               fprintf(f, COLOR_RED
-                                       "This trace point was reached by the CP."
-                                       COLOR_RESET "\n");
-                       else if (packet_id == *ib->trace_ids)
-                               fprintf(f, COLOR_RED
-                                       "!!!!! This is the last trace point that "
-                                       "was reached by the CP !!!!!"
-                                       COLOR_RESET "\n");
-                       else if (packet_id+1 == *ib->trace_ids)
-                               fprintf(f, COLOR_RED
-                                       "!!!!! This is the first trace point that "
-                                       "was NOT been reached by the CP !!!!!"
-                                       COLOR_RESET "\n");
-                       else
-                               fprintf(f, COLOR_RED
-                                       "!!!!! This trace point was NOT reached "
-                                       "by the CP !!!!!"
-                                       COLOR_RESET "\n");
-                       break;
-               }
-               break;
-       }
-
-       /* print additional dwords */
-       while (ib->cur_dw <= first_dw + count)
-               ac_ib_get(ib);
-
-       if (ib->cur_dw > first_dw + count + 1)
-               fprintf(f, COLOR_RED "\n!!!!! count in header too low !!!!!"
-                       COLOR_RESET "\n");
+   unsigned first_dw = ib->cur_dw;
+   int count = PKT_COUNT_G(header);
+   unsigned op = PKT3_IT_OPCODE_G(header);
+   const char *predicate = PKT3_PREDICATE(header) ? "(predicate)" : "";
+   int i;
+
+   /* Print the name first. */
+   for (i = 0; i < ARRAY_SIZE(packet3_table); i++)
+      if (packet3_table[i].op == op)
+         break;
+
+   if (i < ARRAY_SIZE(packet3_table)) {
+      const char *name = sid_strings + packet3_table[i].name_offset;
+
+      if (op == PKT3_SET_CONTEXT_REG || op == PKT3_SET_CONFIG_REG || op == PKT3_SET_UCONFIG_REG ||
+          op == PKT3_SET_UCONFIG_REG_INDEX || op == PKT3_SET_SH_REG)
+         fprintf(f, COLOR_CYAN "%s%s" COLOR_CYAN ":\n", name, predicate);
+      else
+         fprintf(f, COLOR_GREEN "%s%s" COLOR_RESET ":\n", name, predicate);
+   } else
+      fprintf(f, COLOR_RED "PKT3_UNKNOWN 0x%x%s" COLOR_RESET ":\n", op, predicate);
+
+   /* Print the contents. */
+   switch (op) {
+   case PKT3_SET_CONTEXT_REG:
+      ac_parse_set_reg_packet(f, count, SI_CONTEXT_REG_OFFSET, ib);
+      break;
+   case PKT3_SET_CONFIG_REG:
+      ac_parse_set_reg_packet(f, count, SI_CONFIG_REG_OFFSET, ib);
+      break;
+   case PKT3_SET_UCONFIG_REG:
+   case PKT3_SET_UCONFIG_REG_INDEX:
+      ac_parse_set_reg_packet(f, count, CIK_UCONFIG_REG_OFFSET, ib);
+      break;
+   case PKT3_SET_SH_REG:
+      ac_parse_set_reg_packet(f, count, SI_SH_REG_OFFSET, ib);
+      break;
+   case PKT3_ACQUIRE_MEM:
+      ac_dump_reg(f, ib->chip_class, R_0301F0_CP_COHER_CNTL, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_0301F4_CP_COHER_SIZE, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_030230_CP_COHER_SIZE_HI, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_0301F8_CP_COHER_BASE, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_0301E4_CP_COHER_BASE_HI, ac_ib_get(ib), ~0);
+      print_named_value(f, "POLL_INTERVAL", ac_ib_get(ib), 16);
+      if (ib->chip_class >= GFX10)
+         ac_dump_reg(f, ib->chip_class, R_586_GCR_CNTL, ac_ib_get(ib), ~0);
+      break;
+   case PKT3_SURFACE_SYNC:
+      if (ib->chip_class >= GFX7) {
+         ac_dump_reg(f, ib->chip_class, R_0301F0_CP_COHER_CNTL, ac_ib_get(ib), ~0);
+         ac_dump_reg(f, ib->chip_class, R_0301F4_CP_COHER_SIZE, ac_ib_get(ib), ~0);
+         ac_dump_reg(f, ib->chip_class, R_0301F8_CP_COHER_BASE, ac_ib_get(ib), ~0);
+      } else {
+         ac_dump_reg(f, ib->chip_class, R_0085F0_CP_COHER_CNTL, ac_ib_get(ib), ~0);
+         ac_dump_reg(f, ib->chip_class, R_0085F4_CP_COHER_SIZE, ac_ib_get(ib), ~0);
+         ac_dump_reg(f, ib->chip_class, R_0085F8_CP_COHER_BASE, ac_ib_get(ib), ~0);
+      }
+      print_named_value(f, "POLL_INTERVAL", ac_ib_get(ib), 16);
+      break;
+   case PKT3_EVENT_WRITE: {
+      uint32_t event_dw = ac_ib_get(ib);
+      ac_dump_reg(f, ib->chip_class, R_028A90_VGT_EVENT_INITIATOR, event_dw,
+                  S_028A90_EVENT_TYPE(~0));
+      print_named_value(f, "EVENT_INDEX", (event_dw >> 8) & 0xf, 4);
+      print_named_value(f, "INV_L2", (event_dw >> 20) & 0x1, 1);
+      if (count > 0) {
+         print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32);
+         print_named_value(f, "ADDRESS_HI", ac_ib_get(ib), 16);
+      }
+      break;
+   }
+   case PKT3_EVENT_WRITE_EOP: {
+      uint32_t event_dw = ac_ib_get(ib);
+      ac_dump_reg(f, ib->chip_class, R_028A90_VGT_EVENT_INITIATOR, event_dw,
+                  S_028A90_EVENT_TYPE(~0));
+      print_named_value(f, "EVENT_INDEX", (event_dw >> 8) & 0xf, 4);
+      print_named_value(f, "TCL1_VOL_ACTION_ENA", (event_dw >> 12) & 0x1, 1);
+      print_named_value(f, "TC_VOL_ACTION_ENA", (event_dw >> 13) & 0x1, 1);
+      print_named_value(f, "TC_WB_ACTION_ENA", (event_dw >> 15) & 0x1, 1);
+      print_named_value(f, "TCL1_ACTION_ENA", (event_dw >> 16) & 0x1, 1);
+      print_named_value(f, "TC_ACTION_ENA", (event_dw >> 17) & 0x1, 1);
+      print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32);
+      uint32_t addr_hi_dw = ac_ib_get(ib);
+      print_named_value(f, "ADDRESS_HI", addr_hi_dw, 16);
+      print_named_value(f, "DST_SEL", (addr_hi_dw >> 16) & 0x3, 2);
+      print_named_value(f, "INT_SEL", (addr_hi_dw >> 24) & 0x7, 3);
+      print_named_value(f, "DATA_SEL", addr_hi_dw >> 29, 3);
+      print_named_value(f, "DATA_LO", ac_ib_get(ib), 32);
+      print_named_value(f, "DATA_HI", ac_ib_get(ib), 32);
+      break;
+   }
+   case PKT3_RELEASE_MEM: {
+      uint32_t event_dw = ac_ib_get(ib);
+      if (ib->chip_class >= GFX10) {
+         ac_dump_reg(f, ib->chip_class, R_490_RELEASE_MEM_OP, event_dw, ~0u);
+      } else {
+         ac_dump_reg(f, ib->chip_class, R_028A90_VGT_EVENT_INITIATOR, event_dw,
+                     S_028A90_EVENT_TYPE(~0));
+         print_named_value(f, "EVENT_INDEX", (event_dw >> 8) & 0xf, 4);
+         print_named_value(f, "TCL1_VOL_ACTION_ENA", (event_dw >> 12) & 0x1, 1);
+         print_named_value(f, "TC_VOL_ACTION_ENA", (event_dw >> 13) & 0x1, 1);
+         print_named_value(f, "TC_WB_ACTION_ENA", (event_dw >> 15) & 0x1, 1);
+         print_named_value(f, "TCL1_ACTION_ENA", (event_dw >> 16) & 0x1, 1);
+         print_named_value(f, "TC_ACTION_ENA", (event_dw >> 17) & 0x1, 1);
+         print_named_value(f, "TC_NC_ACTION_ENA", (event_dw >> 19) & 0x1, 1);
+         print_named_value(f, "TC_WC_ACTION_ENA", (event_dw >> 20) & 0x1, 1);
+         print_named_value(f, "TC_MD_ACTION_ENA", (event_dw >> 21) & 0x1, 1);
+      }
+      uint32_t sel_dw = ac_ib_get(ib);
+      print_named_value(f, "DST_SEL", (sel_dw >> 16) & 0x3, 2);
+      print_named_value(f, "INT_SEL", (sel_dw >> 24) & 0x7, 3);
+      print_named_value(f, "DATA_SEL", sel_dw >> 29, 3);
+      print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32);
+      print_named_value(f, "ADDRESS_HI", ac_ib_get(ib), 32);
+      print_named_value(f, "DATA_LO", ac_ib_get(ib), 32);
+      print_named_value(f, "DATA_HI", ac_ib_get(ib), 32);
+      print_named_value(f, "CTXID", ac_ib_get(ib), 32);
+      break;
+   }
+   case PKT3_WAIT_REG_MEM:
+      print_named_value(f, "OP", ac_ib_get(ib), 32);
+      print_named_value(f, "ADDRESS_LO", ac_ib_get(ib), 32);
+      print_named_value(f, "ADDRESS_HI", ac_ib_get(ib), 32);
+      print_named_value(f, "REF", ac_ib_get(ib), 32);
+      print_named_value(f, "MASK", ac_ib_get(ib), 32);
+      print_named_value(f, "POLL_INTERVAL", ac_ib_get(ib), 16);
+      break;
+   case PKT3_DRAW_INDEX_AUTO:
+      ac_dump_reg(f, ib->chip_class, R_030930_VGT_NUM_INDICES, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_0287F0_VGT_DRAW_INITIATOR, ac_ib_get(ib), ~0);
+      break;
+   case PKT3_DRAW_INDEX_2:
+      ac_dump_reg(f, ib->chip_class, R_028A78_VGT_DMA_MAX_SIZE, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_0287E8_VGT_DMA_BASE, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_0287E4_VGT_DMA_BASE_HI, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_030930_VGT_NUM_INDICES, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_0287F0_VGT_DRAW_INITIATOR, ac_ib_get(ib), ~0);
+      break;
+   case PKT3_INDEX_TYPE:
+      ac_dump_reg(f, ib->chip_class, R_028A7C_VGT_DMA_INDEX_TYPE, ac_ib_get(ib), ~0);
+      break;
+   case PKT3_NUM_INSTANCES:
+      ac_dump_reg(f, ib->chip_class, R_030934_VGT_NUM_INSTANCES, ac_ib_get(ib), ~0);
+      break;
+   case PKT3_WRITE_DATA:
+      ac_dump_reg(f, ib->chip_class, R_370_CONTROL, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_371_DST_ADDR_LO, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_372_DST_ADDR_HI, ac_ib_get(ib), ~0);
+      /* The payload is written automatically */
+      break;
+   case PKT3_CP_DMA:
+      ac_dump_reg(f, ib->chip_class, R_410_CP_DMA_WORD0, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_411_CP_DMA_WORD1, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_412_CP_DMA_WORD2, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_413_CP_DMA_WORD3, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_414_COMMAND, ac_ib_get(ib), ~0);
+      break;
+   case PKT3_DMA_DATA:
+      ac_dump_reg(f, ib->chip_class, R_500_DMA_DATA_WORD0, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_501_SRC_ADDR_LO, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_502_SRC_ADDR_HI, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_503_DST_ADDR_LO, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_504_DST_ADDR_HI, ac_ib_get(ib), ~0);
+      ac_dump_reg(f, ib->chip_class, R_414_COMMAND, ac_ib_get(ib), ~0);
+      break;
+   case PKT3_INDIRECT_BUFFER_SI:
+   case PKT3_INDIRECT_BUFFER_CONST:
+   case PKT3_INDIRECT_BUFFER_CIK: {
+      uint32_t base_lo_dw = ac_ib_get(ib);
+      ac_dump_reg(f, ib->chip_class, R_3F0_IB_BASE_LO, base_lo_dw, ~0);
+      uint32_t base_hi_dw = ac_ib_get(ib);
+      ac_dump_reg(f, ib->chip_class, R_3F1_IB_BASE_HI, base_hi_dw, ~0);
+      uint32_t control_dw = ac_ib_get(ib);
+      ac_dump_reg(f, ib->chip_class, R_3F2_IB_CONTROL, control_dw, ~0);
+
+      if (!ib->addr_callback)
+         break;
+
+      uint64_t addr = ((uint64_t)base_hi_dw << 32) | base_lo_dw;
+      void *data = ib->addr_callback(ib->addr_callback_data, addr);
+      if (!data)
+         break;
+
+      if (G_3F2_CHAIN(control_dw)) {
+         ib->ib = data;
+         ib->num_dw = G_3F2_IB_SIZE(control_dw);
+         ib->cur_dw = 0;
+         return;
+      }
+
+      struct ac_ib_parser ib_recurse;
+      memcpy(&ib_recurse, ib, sizeof(ib_recurse));
+      ib_recurse.ib = data;
+      ib_recurse.num_dw = G_3F2_IB_SIZE(control_dw);
+      ib_recurse.cur_dw = 0;
+      if (ib_recurse.trace_id_count) {
+         if (*current_trace_id == *ib->trace_ids) {
+            ++ib_recurse.trace_ids;
+            --ib_recurse.trace_id_count;
+         } else {
+            ib_recurse.trace_id_count = 0;
+         }
+      }
+
+      fprintf(f, "\n\035>------------------ nested begin ------------------\n");
+      ac_do_parse_ib(f, &ib_recurse);
+      fprintf(f, "\n\035<------------------- nested end -------------------\n");
+      break;
+   }
+   case PKT3_CLEAR_STATE:
+   case PKT3_INCREMENT_DE_COUNTER:
+   case PKT3_PFP_SYNC_ME:
+      break;
+   case PKT3_NOP:
+      if (header == PKT3_NOP_PAD) {
+         count = -1; /* One dword NOP. */
+      } else if (count == 0 && ib->cur_dw < ib->num_dw && AC_IS_TRACE_POINT(ib->ib[ib->cur_dw])) {
+         unsigned packet_id = AC_GET_TRACE_POINT_ID(ib->ib[ib->cur_dw]);
+
+         print_spaces(f, INDENT_PKT);
+         fprintf(f, COLOR_RED "Trace point ID: %u\n", packet_id);
+
+         if (!ib->trace_id_count)
+            break; /* tracing was disabled */
+
+         *current_trace_id = packet_id;
+
+         print_spaces(f, INDENT_PKT);
+         if (packet_id < *ib->trace_ids)
+            fprintf(f, COLOR_RED "This trace point was reached by the CP." COLOR_RESET "\n");
+         else if (packet_id == *ib->trace_ids)
+            fprintf(f, COLOR_RED "!!!!! This is the last trace point that "
+                                 "was reached by the CP !!!!!" COLOR_RESET "\n");
+         else if (packet_id + 1 == *ib->trace_ids)
+            fprintf(f, COLOR_RED "!!!!! This is the first trace point that "
+                                 "was NOT been reached by the CP !!!!!" COLOR_RESET "\n");
+         else
+            fprintf(f, COLOR_RED "!!!!! This trace point was NOT reached "
+                                 "by the CP !!!!!" COLOR_RESET "\n");
+         break;
+      }
+      break;
+   }
+
+   /* print additional dwords */
+   while (ib->cur_dw <= first_dw + count)
+      ac_ib_get(ib);
+
+   if (ib->cur_dw > first_dw + count + 1)
+      fprintf(f, COLOR_RED "\n!!!!! count in header too low !!!!!" COLOR_RESET "\n");
 }
 
 /**
@@ -517,65 +494,65 @@ static void ac_parse_packet3(FILE *f, uint32_t header, struct ac_ib_parser *ib,
  */
 static void ac_do_parse_ib(FILE *f, struct ac_ib_parser *ib)
 {
-       int current_trace_id = -1;
-
-       while (ib->cur_dw < ib->num_dw) {
-               uint32_t header = ac_ib_get(ib);
-               unsigned type = PKT_TYPE_G(header);
-
-               switch (type) {
-               case 3:
-                       ac_parse_packet3(f, header, ib, &current_trace_id);
-                       break;
-               case 2:
-                       /* type-2 nop */
-                       if (header == 0x80000000) {
-                               fprintf(f, COLOR_GREEN "NOP (type 2)" COLOR_RESET "\n");
-                               break;
-                       }
-                       /* fall through */
-               default:
-                       fprintf(f, "Unknown packet type %i\n", type);
-                       break;
-               }
-       }
+   int current_trace_id = -1;
+
+   while (ib->cur_dw < ib->num_dw) {
+      uint32_t header = ac_ib_get(ib);
+      unsigned type = PKT_TYPE_G(header);
+
+      switch (type) {
+      case 3:
+         ac_parse_packet3(f, header, ib, &current_trace_id);
+         break;
+      case 2:
+         /* type-2 nop */
+         if (header == 0x80000000) {
+            fprintf(f, COLOR_GREEN "NOP (type 2)" COLOR_RESET "\n");
+            break;
+         }
+         /* fall through */
+      default:
+         fprintf(f, "Unknown packet type %i\n", type);
+         break;
+      }
+   }
 }
 
 static void format_ib_output(FILE *f, char *out)
 {
-       unsigned depth = 0;
+   unsigned depth = 0;
 
-       for (;;) {
-               char op = 0;
+   for (;;) {
+      char op = 0;
 
-               if (out[0] == '\n' && out[1] == '\035')
-                       out++;
-               if (out[0] == '\035') {
-                       op = out[1];
-                       out += 2;
-               }
+      if (out[0] == '\n' && out[1] == '\035')
+         out++;
+      if (out[0] == '\035') {
+         op = out[1];
+         out += 2;
+      }
 
-               if (op == '<')
-                       depth--;
+      if (op == '<')
+         depth--;
 
-               unsigned indent = 4 * depth;
-               if (op != '#')
-                       indent += 9;
+      unsigned indent = 4 * depth;
+      if (op != '#')
+         indent += 9;
 
-               if (indent)
-                       print_spaces(f, indent);
+      if (indent)
+         print_spaces(f, indent);
 
-               char *end = strchrnul(out, '\n');
-               fwrite(out, end - out, 1, f);
-               fputc('\n', f); /* always end with a new line */
-               if (!*end)
-                       break;
+      char *end = strchrnul(out, '\n');
+      fwrite(out, end - out, 1, f);
+      fputc('\n', f); /* always end with a new line */
+      if (!*end)
+         break;
 
-               out = end + 1;
+      out = end + 1;
 
-               if (op == '>')
-                       depth++;
-       }
+      if (op == '>')
+         depth++;
+   }
 }
 
 /**
@@ -593,34 +570,34 @@ static void format_ib_output(FILE *f, char *out)
  * \param addr_callback_data user data for addr_callback
  */
 void ac_parse_ib_chunk(FILE *f, uint32_t *ib_ptr, int num_dw, const int *trace_ids,
-                      unsigned trace_id_count, enum chip_class chip_class,
+                       unsigned trace_id_count, enum chip_class chip_class,
                        ac_debug_addr_callback addr_callback, void *addr_callback_data)
 {
-       struct ac_ib_parser ib = {};
-       ib.ib = ib_ptr;
-       ib.num_dw = num_dw;
-       ib.trace_ids = trace_ids;
-       ib.trace_id_count = trace_id_count;
-       ib.chip_class = chip_class;
-       ib.addr_callback = addr_callback;
-       ib.addr_callback_data = addr_callback_data;
-
-       char *out;
-       size_t outsize;
-       FILE *memf = open_memstream(&out, &outsize);
-       ib.f = memf;
-       ac_do_parse_ib(memf, &ib);
-       fclose(memf);
-
-       if (out) {
-               format_ib_output(f, out);
-               free(out);
-       }
-
-       if (ib.cur_dw > ib.num_dw) {
-               printf("\nPacket ends after the end of IB.\n");
-               exit(1);
-       }
+   struct ac_ib_parser ib = {};
+   ib.ib = ib_ptr;
+   ib.num_dw = num_dw;
+   ib.trace_ids = trace_ids;
+   ib.trace_id_count = trace_id_count;
+   ib.chip_class = chip_class;
+   ib.addr_callback = addr_callback;
+   ib.addr_callback_data = addr_callback_data;
+
+   char *out;
+   size_t outsize;
+   FILE *memf = open_memstream(&out, &outsize);
+   ib.f = memf;
+   ac_do_parse_ib(memf, &ib);
+   fclose(memf);
+
+   if (out) {
+      format_ib_output(f, out);
+      free(out);
+   }
+
+   if (ib.cur_dw > ib.num_dw) {
+      printf("\nPacket ends after the end of IB.\n");
+      exit(1);
+   }
 }
 
 /**
@@ -637,17 +614,16 @@ void ac_parse_ib_chunk(FILE *f, uint32_t *ib_ptr, int num_dw, const int *trace_i
  *                      be NULL.
  * \param addr_callback_data user data for addr_callback
  */
-void ac_parse_ib(FILE *f, uint32_t *ib, int num_dw, const int *trace_ids,
-                unsigned trace_id_count, const char *name,
-                enum chip_class chip_class, ac_debug_addr_callback addr_callback,
-                void *addr_callback_data)
+void ac_parse_ib(FILE *f, uint32_t *ib, int num_dw, const int *trace_ids, unsigned trace_id_count,
+                 const char *name, enum chip_class chip_class, ac_debug_addr_callback addr_callback,
+                 void *addr_callback_data)
 {
-       fprintf(f, "------------------ %s begin ------------------\n", name);
+   fprintf(f, "------------------ %s begin ------------------\n", name);
 
-       ac_parse_ib_chunk(f, ib, num_dw, trace_ids, trace_id_count,
-                         chip_class, addr_callback,  addr_callback_data);
+   ac_parse_ib_chunk(f, ib, num_dw, trace_ids, trace_id_count, chip_class, addr_callback,
+                     addr_callback_data);
 
-       fprintf(f, "------------------- %s end -------------------\n\n", name);
+   fprintf(f, "------------------- %s end -------------------\n\n", name);
 }
 
 /**
@@ -657,179 +633,176 @@ void ac_parse_ib(FILE *f, uint32_t *ib, int num_dw, const int *trace_ids,
  * \param old_dmesg_timestamp  previous dmesg timestamp parsed at init time
  * \param out_addr             detected VM fault addr
  */
-bool ac_vm_fault_occured(enum chip_class chip_class,
-                        uint64_t *old_dmesg_timestamp, uint64_t *out_addr)
+bool ac_vm_fault_occured(enum chip_class chip_class, uint64_t *old_dmesg_timestamp,
+                         uint64_t *out_addr)
 {
-       char line[2000];
-       unsigned sec, usec;
-       int progress = 0;
-       uint64_t dmesg_timestamp = 0;
-       bool fault = false;
-
-       FILE *p = popen("dmesg", "r");
-       if (!p)
-               return false;
-
-       while (fgets(line, sizeof(line), p)) {
-               char *msg, len;
-
-               if (!line[0] || line[0] == '\n')
-                       continue;
-
-               /* Get the timestamp. */
-               if (sscanf(line, "[%u.%u]", &sec, &usec) != 2) {
-                       static bool hit = false;
-                       if (!hit) {
-                               fprintf(stderr, "%s: failed to parse line '%s'\n",
-                                       __func__, line);
-                               hit = true;
-                       }
-                       continue;
-               }
-               dmesg_timestamp = sec * 1000000ull + usec;
-
-               /* If just updating the timestamp. */
-               if (!out_addr)
-                       continue;
-
-               /* Process messages only if the timestamp is newer. */
-               if (dmesg_timestamp <= *old_dmesg_timestamp)
-                       continue;
-
-               /* Only process the first VM fault. */
-               if (fault)
-                       continue;
-
-               /* Remove trailing \n */
-               len = strlen(line);
-               if (len && line[len-1] == '\n')
-                       line[len-1] = 0;
-
-               /* Get the message part. */
-               msg = strchr(line, ']');
-               if (!msg)
-                       continue;
-               msg++;
-
-               const char *header_line, *addr_line_prefix, *addr_line_format;
-
-               if (chip_class >= GFX9) {
-                       /* Match this:
-                        * ..: [gfxhub] VMC page fault (src_id:0 ring:158 vm_id:2 pas_id:0)
-                        * ..:   at page 0x0000000219f8f000 from 27
-                        * ..: VM_L2_PROTECTION_FAULT_STATUS:0x0020113C
-                        */
-                       header_line = "VMC page fault";
-                       addr_line_prefix = "   at page";
-                       addr_line_format = "%"PRIx64;
-               } else {
-                       header_line = "GPU fault detected:";
-                       addr_line_prefix = "VM_CONTEXT1_PROTECTION_FAULT_ADDR";
-                       addr_line_format = "%"PRIX64;
-               }
-
-               switch (progress) {
-               case 0:
-                       if (strstr(msg, header_line))
-                               progress = 1;
-                       break;
-               case 1:
-                       msg = strstr(msg, addr_line_prefix);
-                       if (msg) {
-                               msg = strstr(msg, "0x");
-                               if (msg) {
-                                       msg += 2;
-                                       if (sscanf(msg, addr_line_format, out_addr) == 1)
-                                               fault = true;
-                               }
-                       }
-                       progress = 0;
-                       break;
-               default:
-                       progress = 0;
-               }
-       }
-       pclose(p);
-
-       if (dmesg_timestamp > *old_dmesg_timestamp)
-               *old_dmesg_timestamp = dmesg_timestamp;
-
-       return fault;
+   char line[2000];
+   unsigned sec, usec;
+   int progress = 0;
+   uint64_t dmesg_timestamp = 0;
+   bool fault = false;
+
+   FILE *p = popen("dmesg", "r");
+   if (!p)
+      return false;
+
+   while (fgets(line, sizeof(line), p)) {
+      char *msg, len;
+
+      if (!line[0] || line[0] == '\n')
+         continue;
+
+      /* Get the timestamp. */
+      if (sscanf(line, "[%u.%u]", &sec, &usec) != 2) {
+         static bool hit = false;
+         if (!hit) {
+            fprintf(stderr, "%s: failed to parse line '%s'\n", __func__, line);
+            hit = true;
+         }
+         continue;
+      }
+      dmesg_timestamp = sec * 1000000ull + usec;
+
+      /* If just updating the timestamp. */
+      if (!out_addr)
+         continue;
+
+      /* Process messages only if the timestamp is newer. */
+      if (dmesg_timestamp <= *old_dmesg_timestamp)
+         continue;
+
+      /* Only process the first VM fault. */
+      if (fault)
+         continue;
+
+      /* Remove trailing \n */
+      len = strlen(line);
+      if (len && line[len - 1] == '\n')
+         line[len - 1] = 0;
+
+      /* Get the message part. */
+      msg = strchr(line, ']');
+      if (!msg)
+         continue;
+      msg++;
+
+      const char *header_line, *addr_line_prefix, *addr_line_format;
+
+      if (chip_class >= GFX9) {
+         /* Match this:
+          * ..: [gfxhub] VMC page fault (src_id:0 ring:158 vm_id:2 pas_id:0)
+          * ..:   at page 0x0000000219f8f000 from 27
+          * ..: VM_L2_PROTECTION_FAULT_STATUS:0x0020113C
+          */
+         header_line = "VMC page fault";
+         addr_line_prefix = "   at page";
+         addr_line_format = "%" PRIx64;
+      } else {
+         header_line = "GPU fault detected:";
+         addr_line_prefix = "VM_CONTEXT1_PROTECTION_FAULT_ADDR";
+         addr_line_format = "%" PRIX64;
+      }
+
+      switch (progress) {
+      case 0:
+         if (strstr(msg, header_line))
+            progress = 1;
+         break;
+      case 1:
+         msg = strstr(msg, addr_line_prefix);
+         if (msg) {
+            msg = strstr(msg, "0x");
+            if (msg) {
+               msg += 2;
+               if (sscanf(msg, addr_line_format, out_addr) == 1)
+                  fault = true;
+            }
+         }
+         progress = 0;
+         break;
+      default:
+         progress = 0;
+      }
+   }
+   pclose(p);
+
+   if (dmesg_timestamp > *old_dmesg_timestamp)
+      *old_dmesg_timestamp = dmesg_timestamp;
+
+   return fault;
 }
 
 static int compare_wave(const void *p1, const void *p2)
 {
-       struct ac_wave_info *w1 = (struct ac_wave_info *)p1;
-       struct ac_wave_info *w2 = (struct ac_wave_info *)p2;
-
-       /* Sort waves according to PC and then SE, SH, CU, etc. */
-       if (w1->pc < w2->pc)
-               return -1;
-       if (w1->pc > w2->pc)
-               return 1;
-       if (w1->se < w2->se)
-               return -1;
-       if (w1->se > w2->se)
-               return 1;
-       if (w1->sh < w2->sh)
-               return -1;
-       if (w1->sh > w2->sh)
-               return 1;
-       if (w1->cu < w2->cu)
-               return -1;
-       if (w1->cu > w2->cu)
-               return 1;
-       if (w1->simd < w2->simd)
-               return -1;
-       if (w1->simd > w2->simd)
-               return 1;
-       if (w1->wave < w2->wave)
-               return -1;
-       if (w1->wave > w2->wave)
-               return 1;
-
-       return 0;
+   struct ac_wave_info *w1 = (struct ac_wave_info *)p1;
+   struct ac_wave_info *w2 = (struct ac_wave_info *)p2;
+
+   /* Sort waves according to PC and then SE, SH, CU, etc. */
+   if (w1->pc < w2->pc)
+      return -1;
+   if (w1->pc > w2->pc)
+      return 1;
+   if (w1->se < w2->se)
+      return -1;
+   if (w1->se > w2->se)
+      return 1;
+   if (w1->sh < w2->sh)
+      return -1;
+   if (w1->sh > w2->sh)
+      return 1;
+   if (w1->cu < w2->cu)
+      return -1;
+   if (w1->cu > w2->cu)
+      return 1;
+   if (w1->simd < w2->simd)
+      return -1;
+   if (w1->simd > w2->simd)
+      return 1;
+   if (w1->wave < w2->wave)
+      return -1;
+   if (w1->wave > w2->wave)
+      return 1;
+
+   return 0;
 }
 
 /* Return wave information. "waves" should be a large enough array. */
 unsigned ac_get_wave_info(enum chip_class chip_class,
-                         struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP])
+                          struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP])
 {
-       char line[2000], cmd[128];
-       unsigned num_waves = 0;
-
-       sprintf(cmd, "umr -O halt_waves -wa %s", chip_class >= GFX10 ? "gfx_0.0.0" : "gfx");
-
-       FILE *p = popen(cmd, "r");
-       if (!p)
-               return 0;
-
-       if (!fgets(line, sizeof(line), p) ||
-           strncmp(line, "SE", 2) != 0) {
-               pclose(p);
-               return 0;
-       }
-
-       while (fgets(line, sizeof(line), p)) {
-               struct ac_wave_info *w;
-               uint32_t pc_hi, pc_lo, exec_hi, exec_lo;
-
-               assert(num_waves < AC_MAX_WAVES_PER_CHIP);
-               w = &waves[num_waves];
-
-               if (sscanf(line, "%u %u %u %u %u %x %x %x %x %x %x %x",
-                          &w->se, &w->sh, &w->cu, &w->simd, &w->wave,
-                          &w->status, &pc_hi, &pc_lo, &w->inst_dw0,
-                          &w->inst_dw1, &exec_hi, &exec_lo) == 12) {
-                       w->pc = ((uint64_t)pc_hi << 32) | pc_lo;
-                       w->exec = ((uint64_t)exec_hi << 32) | exec_lo;
-                       w->matched = false;
-                       num_waves++;
-               }
-       }
-
-       qsort(waves, num_waves, sizeof(struct ac_wave_info), compare_wave);
-
-       pclose(p);
-       return num_waves;
+   char line[2000], cmd[128];
+   unsigned num_waves = 0;
+
+   sprintf(cmd, "umr -O halt_waves -wa %s", chip_class >= GFX10 ? "gfx_0.0.0" : "gfx");
+
+   FILE *p = popen(cmd, "r");
+   if (!p)
+      return 0;
+
+   if (!fgets(line, sizeof(line), p) || strncmp(line, "SE", 2) != 0) {
+      pclose(p);
+      return 0;
+   }
+
+   while (fgets(line, sizeof(line), p)) {
+      struct ac_wave_info *w;
+      uint32_t pc_hi, pc_lo, exec_hi, exec_lo;
+
+      assert(num_waves < AC_MAX_WAVES_PER_CHIP);
+      w = &waves[num_waves];
+
+      if (sscanf(line, "%u %u %u %u %u %x %x %x %x %x %x %x", &w->se, &w->sh, &w->cu, &w->simd,
+                 &w->wave, &w->status, &pc_hi, &pc_lo, &w->inst_dw0, &w->inst_dw1, &exec_hi,
+                 &exec_lo) == 12) {
+         w->pc = ((uint64_t)pc_hi << 32) | pc_lo;
+         w->exec = ((uint64_t)exec_hi << 32) | exec_lo;
+         w->matched = false;
+         num_waves++;
+      }
+   }
+
+   qsort(waves, num_waves, sizeof(struct ac_wave_info), compare_wave);
+
+   pclose(p);
+   return num_waves;
 }
index e66abb940c300f12a216eb67a73f7e87f3c9e3ce..72441f7d6ccb15b80d4da768705f766197cdd6ea 100644 (file)
 #ifndef AC_DEBUG_H
 #define AC_DEBUG_H
 
+#include "amd_family.h"
+
+#include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
-#include <stdbool.h>
-
-#include "amd_family.h"
 
-#define AC_ENCODE_TRACE_POINT(id)       (0xcafe0000 | ((id) & 0xffff))
-#define AC_IS_TRACE_POINT(x)            (((x) & 0xcafe0000) == 0xcafe0000)
-#define AC_GET_TRACE_POINT_ID(x)        ((x) & 0xffff)
+#define AC_ENCODE_TRACE_POINT(id) (0xcafe0000 | ((id)&0xffff))
+#define AC_IS_TRACE_POINT(x)      (((x)&0xcafe0000) == 0xcafe0000)
+#define AC_GET_TRACE_POINT_ID(x)  ((x)&0xffff)
 
 #define AC_MAX_WAVES_PER_CHIP (64 * 40)
 
@@ -41,36 +41,36 @@ extern "C" {
 #endif
 
 struct ac_wave_info {
-       unsigned se; /* shader engine */
-       unsigned sh; /* shader array */
-       unsigned cu; /* compute unit */
-       unsigned simd;
-       unsigned wave;
-       uint32_t status;
-       uint64_t pc; /* program counter */
-       uint32_t inst_dw0;
-       uint32_t inst_dw1;
-       uint64_t exec;
-       bool matched; /* whether the wave is used by a currently-bound shader */
+   unsigned se; /* shader engine */
+   unsigned sh; /* shader array */
+   unsigned cu; /* compute unit */
+   unsigned simd;
+   unsigned wave;
+   uint32_t status;
+   uint64_t pc; /* program counter */
+   uint32_t inst_dw0;
+   uint32_t inst_dw1;
+   uint64_t exec;
+   bool matched; /* whether the wave is used by a currently-bound shader */
 };
 
 typedef void *(*ac_debug_addr_callback)(void *data, uint64_t addr);
 
 const char *ac_get_register_name(enum chip_class chip_class, unsigned offset);
-void ac_dump_reg(FILE *file, enum chip_class chip_class, unsigned offset,
-                uint32_t value, uint32_t field_mask);
+void ac_dump_reg(FILE *file, enum chip_class chip_class, unsigned offset, uint32_t value,
+                 uint32_t field_mask);
 void ac_parse_ib_chunk(FILE *f, uint32_t *ib, int num_dw, const int *trace_ids,
-                      unsigned trace_id_count, enum chip_class chip_class,
-                      ac_debug_addr_callback addr_callback, void *addr_callback_data);
-void ac_parse_ib(FILE *f, uint32_t *ib, int num_dw, const int *trace_ids,
-                unsigned trace_id_count, const char *name, enum chip_class chip_class,
-                ac_debug_addr_callback addr_callback, void *addr_callback_data);
+                       unsigned trace_id_count, enum chip_class chip_class,
+                       ac_debug_addr_callback addr_callback, void *addr_callback_data);
+void ac_parse_ib(FILE *f, uint32_t *ib, int num_dw, const int *trace_ids, unsigned trace_id_count,
+                 const char *name, enum chip_class chip_class, ac_debug_addr_callback addr_callback,
+                 void *addr_callback_data);
 
-bool ac_vm_fault_occured(enum chip_class chip_class,
-                        uint64_t *old_dmesg_timestamp, uint64_t *out_addr);
+bool ac_vm_fault_occured(enum chip_class chip_class, uint64_t *old_dmesg_timestamp,
+                         uint64_t *out_addr);
 
 unsigned ac_get_wave_info(enum chip_class chip_class,
-                         struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP]);
+                          struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP]);
 
 #ifdef __cplusplus
 }
index b97ce8154e0203c6ddd9e44a6af63fb68e3919a2..ac8018c0b39ea69c42257f706535e4ad17933a04 100644 (file)
 #ifndef AC_EXP_PARAM_H
 #define AC_EXP_PARAM_H
 
-enum {
-       /* SPI_PS_INPUT_CNTL_i.OFFSET[0:4] */
-       AC_EXP_PARAM_OFFSET_0 = 0,
-       AC_EXP_PARAM_OFFSET_31 = 31,
-       /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL[0:1] */
-       AC_EXP_PARAM_DEFAULT_VAL_0000 = 64,
-       AC_EXP_PARAM_DEFAULT_VAL_0001,
-       AC_EXP_PARAM_DEFAULT_VAL_1110,
-       AC_EXP_PARAM_DEFAULT_VAL_1111,
-       AC_EXP_PARAM_UNDEFINED = 255,
+enum
+{
+   /* SPI_PS_INPUT_CNTL_i.OFFSET[0:4] */
+   AC_EXP_PARAM_OFFSET_0 = 0,
+   AC_EXP_PARAM_OFFSET_31 = 31,
+   /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL[0:1] */
+   AC_EXP_PARAM_DEFAULT_VAL_0000 = 64,
+   AC_EXP_PARAM_DEFAULT_VAL_0001,
+   AC_EXP_PARAM_DEFAULT_VAL_1110,
+   AC_EXP_PARAM_DEFAULT_VAL_1111,
+   AC_EXP_PARAM_UNDEFINED = 255,
 };
 
 #endif
index e6ed816f74c4f58fb5a5427ce36f577bbb7f48fd..770737a7ed42ff74025f81b1331a7f81efdd730c 100644 (file)
  */
 
 #include "ac_gpu_info.h"
+
 #include "addrlib/src/amdgpu_asic_addr.h"
+#include "drm-uapi/amdgpu_drm.h"
 #include "sid.h"
-
 #include "util/macros.h"
 #include "util/u_math.h"
 
+#include <amdgpu.h>
 #include <stdio.h>
-
 #include <xf86drm.h>
-#include "drm-uapi/amdgpu_drm.h"
 
-#include <amdgpu.h>
-
-#define CIK_TILE_MODE_COLOR_2D                 14
-
-#define CIK__GB_TILE_MODE__PIPE_CONFIG(x)        (((x) >> 6) & 0x1f)
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P2               0
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16          4
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16         5
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32         6
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32         7
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16    8
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16    9
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16    10
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16   11
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16   12
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32   13
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32   14
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16   16
-#define     CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16  17
+#define CIK_TILE_MODE_COLOR_2D 14
+
+#define CIK__GB_TILE_MODE__PIPE_CONFIG(x)           (((x) >> 6) & 0x1f)
+#define CIK__PIPE_CONFIG__ADDR_SURF_P2              0
+#define CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16         4
+#define CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16        5
+#define CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32        6
+#define CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32        7
+#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16   8
+#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16   9
+#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16   10
+#define CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_16x16  11
+#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16  12
+#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32  13
+#define CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32  14
+#define CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16  16
+#define CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16 17
 
 static unsigned cik_get_num_tile_pipes(struct amdgpu_gpu_info *info)
 {
@@ -61,12 +59,12 @@ static unsigned cik_get_num_tile_pipes(struct amdgpu_gpu_info *info)
 
    switch (CIK__GB_TILE_MODE__PIPE_CONFIG(mode2d)) {
    case CIK__PIPE_CONFIG__ADDR_SURF_P2:
-       return 2;
+      return 2;
    case CIK__PIPE_CONFIG__ADDR_SURF_P4_8x16:
    case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x16:
    case CIK__PIPE_CONFIG__ADDR_SURF_P4_16x32:
    case CIK__PIPE_CONFIG__ADDR_SURF_P4_32x32:
-       return 4;
+      return 4;
    case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x16_8x16:
    case CIK__PIPE_CONFIG__ADDR_SURF_P8_16x32_8x16:
    case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_8x16:
@@ -74,1239 +72,1162 @@ static unsigned cik_get_num_tile_pipes(struct amdgpu_gpu_info *info)
    case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x16:
    case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x32_16x32:
    case CIK__PIPE_CONFIG__ADDR_SURF_P8_32x64_32x32:
-       return 8;
+      return 8;
    case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_8X16:
    case CIK__PIPE_CONFIG__ADDR_SURF_P16_32X32_16X16:
-       return 16;
+      return 16;
    default:
-       fprintf(stderr, "Invalid GFX7 pipe configuration, assuming P2\n");
-       assert(!"this should never occur");
-       return 2;
+      fprintf(stderr, "Invalid GFX7 pipe configuration, assuming P2\n");
+      assert(!"this should never occur");
+      return 2;
    }
 }
 
 static bool has_syncobj(int fd)
 {
-       uint64_t value;
-       if (drmGetCap(fd, DRM_CAP_SYNCOBJ, &value))
-               return false;
-       return value ? true : false;
+   uint64_t value;
+   if (drmGetCap(fd, DRM_CAP_SYNCOBJ, &value))
+      return false;
+   return value ? true : false;
 }
 
 static bool has_timeline_syncobj(int fd)
 {
-       uint64_t value;
-       if (drmGetCap(fd, DRM_CAP_SYNCOBJ_TIMELINE, &value))
-               return false;
-       return value ? true : false;
+   uint64_t value;
+   if (drmGetCap(fd, DRM_CAP_SYNCOBJ_TIMELINE, &value))
+      return false;
+   return value ? true : false;
 }
 
 static uint64_t fix_vram_size(uint64_t size)
 {
-       /* The VRAM size is underreported, so we need to fix it, because
-        * it's used to compute the number of memory modules for harvesting.
-        */
-       return align64(size, 256*1024*1024);
+   /* The VRAM size is underreported, so we need to fix it, because
+    * it's used to compute the number of memory modules for harvesting.
+    */
+   return align64(size, 256 * 1024 * 1024);
 }
 
-static uint32_t
-get_l2_cache_size(enum radeon_family family)
+static uint32_t get_l2_cache_size(enum radeon_family family)
 {
-       switch (family) {
-       case CHIP_KABINI:
-       case CHIP_STONEY:
-               return 128 * 1024;
-       case CHIP_OLAND:
-       case CHIP_HAINAN:
-       case CHIP_ICELAND:
-               return 256 * 1024;
-       case CHIP_PITCAIRN:
-       case CHIP_VERDE:
-       case CHIP_BONAIRE:
-       case CHIP_KAVERI:
-       case CHIP_POLARIS12:
-       case CHIP_CARRIZO:
-               return 512 * 1024;
-       case CHIP_TAHITI:
-       case CHIP_TONGA:
-               return 768 * 1024;
-               break;
-       case CHIP_HAWAII:
-       case CHIP_POLARIS11:
-               return 1024 * 1024;
-       case CHIP_FIJI:
-       case CHIP_POLARIS10:
-               return 2048 * 1024;
-               break;
-       default:
-               return 4096 * 1024;
-       }
+   switch (family) {
+   case CHIP_KABINI:
+   case CHIP_STONEY:
+      return 128 * 1024;
+   case CHIP_OLAND:
+   case CHIP_HAINAN:
+   case CHIP_ICELAND:
+      return 256 * 1024;
+   case CHIP_PITCAIRN:
+   case CHIP_VERDE:
+   case CHIP_BONAIRE:
+   case CHIP_KAVERI:
+   case CHIP_POLARIS12:
+   case CHIP_CARRIZO:
+      return 512 * 1024;
+   case CHIP_TAHITI:
+   case CHIP_TONGA:
+      return 768 * 1024;
+      break;
+   case CHIP_HAWAII:
+   case CHIP_POLARIS11:
+      return 1024 * 1024;
+   case CHIP_FIJI:
+   case CHIP_POLARIS10:
+      return 2048 * 1024;
+      break;
+   default:
+      return 4096 * 1024;
+   }
 }
 
-bool ac_query_gpu_info(int fd, void *dev_p,
-                      struct radeon_info *info,
-                      struct amdgpu_gpu_info *amdinfo)
+bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
+                       struct amdgpu_gpu_info *amdinfo)
 {
-       struct drm_amdgpu_info_device device_info = {};
-       struct amdgpu_buffer_size_alignments alignment_info = {};
-       struct drm_amdgpu_info_hw_ip dma = {}, compute = {}, uvd = {};
-       struct drm_amdgpu_info_hw_ip uvd_enc = {}, vce = {}, vcn_dec = {}, vcn_jpeg = {};
-       struct drm_amdgpu_info_hw_ip vcn_enc = {}, gfx = {};
-       struct amdgpu_gds_resource_info gds = {};
-       uint32_t vce_version = 0, vce_feature = 0, uvd_version = 0, uvd_feature = 0;
-       int r, i, j;
-       amdgpu_device_handle dev = dev_p;
-       drmDevicePtr devinfo;
-
-       /* Get PCI info. */
-       r = drmGetDevice2(fd, 0, &devinfo);
-       if (r) {
-               fprintf(stderr, "amdgpu: drmGetDevice2 failed.\n");
-               return false;
-       }
-       info->pci_domain = devinfo->businfo.pci->domain;
-       info->pci_bus = devinfo->businfo.pci->bus;
-       info->pci_dev = devinfo->businfo.pci->dev;
-       info->pci_func = devinfo->businfo.pci->func;
-       drmFreeDevice(&devinfo);
-
-       assert(info->drm_major == 3);
-       info->is_amdgpu = true;
-
-       /* Query hardware and driver information. */
-       r = amdgpu_query_gpu_info(dev, amdinfo);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_gpu_info failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_info(dev, AMDGPU_INFO_DEV_INFO, sizeof(device_info),
-                             &device_info);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_info(dev_info) failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_buffer_size_alignment(dev, &alignment_info);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_buffer_size_alignment failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_DMA, 0, &dma);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(dma) failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_GFX, 0, &gfx);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(gfx) failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_COMPUTE, 0, &compute);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(compute) failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_UVD, 0, &uvd);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(uvd) failed.\n");
-               return false;
-       }
-
-       if (info->drm_minor >= 17) {
-               r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_UVD_ENC, 0, &uvd_enc);
-               if (r) {
-                       fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(uvd_enc) failed.\n");
-                       return false;
-               }
-       }
-
-       if (info->drm_minor >= 17) {
-               r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCN_DEC, 0, &vcn_dec);
-               if (r) {
-                       fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vcn_dec) failed.\n");
-                       return false;
-               }
-       }
-
-       if (info->drm_minor >= 17) {
-               r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCN_ENC, 0, &vcn_enc);
-               if (r) {
-                       fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vcn_enc) failed.\n");
-                       return false;
-               }
-       }
-
-       if (info->drm_minor >= 27) {
-               r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCN_JPEG, 0, &vcn_jpeg);
-               if (r) {
-                       fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vcn_jpeg) failed.\n");
-                       return false;
-               }
-       }
-
-       r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_ME, 0, 0,
-                                       &info->me_fw_version,
-                                       &info->me_fw_feature);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(me) failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_PFP, 0, 0,
-                                       &info->pfp_fw_version,
-                                       &info->pfp_fw_feature);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(pfp) failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_CE, 0, 0,
-                                       &info->ce_fw_version,
-                                       &info->ce_fw_feature);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(ce) failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_UVD, 0, 0,
-                                       &uvd_version, &uvd_feature);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(uvd) failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCE, 0, &vce);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vce) failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_VCE, 0, 0,
-                                       &vce_version, &vce_feature);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(vce) failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_sw_info(dev, amdgpu_sw_info_address32_hi, &info->address32_hi);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_sw_info(address32_hi) failed.\n");
-               return false;
-       }
-
-       r = amdgpu_query_gds_info(dev, &gds);
-       if (r) {
-               fprintf(stderr, "amdgpu: amdgpu_query_gds_info failed.\n");
-               return false;
-       }
-
-       if (info->drm_minor >= 9) {
-               struct drm_amdgpu_memory_info meminfo = {};
-
-               r = amdgpu_query_info(dev, AMDGPU_INFO_MEMORY, sizeof(meminfo), &meminfo);
-               if (r) {
-                       fprintf(stderr, "amdgpu: amdgpu_query_info(memory) failed.\n");
-                       return false;
-               }
-
-               /* Note: usable_heap_size values can be random and can't be relied on. */
-               info->gart_size = meminfo.gtt.total_heap_size;
-               info->vram_size = fix_vram_size(meminfo.vram.total_heap_size);
-               info->vram_vis_size = meminfo.cpu_accessible_vram.total_heap_size;
-       } else {
-               /* This is a deprecated interface, which reports usable sizes
-                * (total minus pinned), but the pinned size computation is
-                * buggy, so the values returned from these functions can be
-                * random.
-                */
-               struct amdgpu_heap_info vram, vram_vis, gtt;
-
-               r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_VRAM, 0, &vram);
-               if (r) {
-                       fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram) failed.\n");
-                       return false;
-               }
-
-               r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_VRAM,
-                                       AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED,
-                                       &vram_vis);
-               if (r) {
-                       fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram_vis) failed.\n");
-                       return false;
-               }
-
-               r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_GTT, 0, &gtt);
-               if (r) {
-                       fprintf(stderr, "amdgpu: amdgpu_query_heap_info(gtt) failed.\n");
-                       return false;
-               }
-
-               info->gart_size = gtt.heap_size;
-               info->vram_size = fix_vram_size(vram.heap_size);
-               info->vram_vis_size = vram_vis.heap_size;
-       }
-
-       /* Set chip identification. */
-       info->pci_id = amdinfo->asic_id; /* TODO: is this correct? */
-       info->pci_rev_id = amdinfo->pci_rev_id;
-       info->vce_harvest_config = amdinfo->vce_harvest_config;
-
-#define identify_chip2(asic, chipname) \
-       if (ASICREV_IS(amdinfo->chip_external_rev, asic)) { \
-               info->family = CHIP_##chipname; \
-               info->name = #chipname; \
-       }
+   struct drm_amdgpu_info_device device_info = {};
+   struct amdgpu_buffer_size_alignments alignment_info = {};
+   struct drm_amdgpu_info_hw_ip dma = {}, compute = {}, uvd = {};
+   struct drm_amdgpu_info_hw_ip uvd_enc = {}, vce = {}, vcn_dec = {}, vcn_jpeg = {};
+   struct drm_amdgpu_info_hw_ip vcn_enc = {}, gfx = {};
+   struct amdgpu_gds_resource_info gds = {};
+   uint32_t vce_version = 0, vce_feature = 0, uvd_version = 0, uvd_feature = 0;
+   int r, i, j;
+   amdgpu_device_handle dev = dev_p;
+   drmDevicePtr devinfo;
+
+   /* Get PCI info. */
+   r = drmGetDevice2(fd, 0, &devinfo);
+   if (r) {
+      fprintf(stderr, "amdgpu: drmGetDevice2 failed.\n");
+      return false;
+   }
+   info->pci_domain = devinfo->businfo.pci->domain;
+   info->pci_bus = devinfo->businfo.pci->bus;
+   info->pci_dev = devinfo->businfo.pci->dev;
+   info->pci_func = devinfo->businfo.pci->func;
+   drmFreeDevice(&devinfo);
+
+   assert(info->drm_major == 3);
+   info->is_amdgpu = true;
+
+   /* Query hardware and driver information. */
+   r = amdgpu_query_gpu_info(dev, amdinfo);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_gpu_info failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_info(dev, AMDGPU_INFO_DEV_INFO, sizeof(device_info), &device_info);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_info(dev_info) failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_buffer_size_alignment(dev, &alignment_info);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_buffer_size_alignment failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_DMA, 0, &dma);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(dma) failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_GFX, 0, &gfx);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(gfx) failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_COMPUTE, 0, &compute);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(compute) failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_UVD, 0, &uvd);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(uvd) failed.\n");
+      return false;
+   }
+
+   if (info->drm_minor >= 17) {
+      r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_UVD_ENC, 0, &uvd_enc);
+      if (r) {
+         fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(uvd_enc) failed.\n");
+         return false;
+      }
+   }
+
+   if (info->drm_minor >= 17) {
+      r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCN_DEC, 0, &vcn_dec);
+      if (r) {
+         fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vcn_dec) failed.\n");
+         return false;
+      }
+   }
+
+   if (info->drm_minor >= 17) {
+      r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCN_ENC, 0, &vcn_enc);
+      if (r) {
+         fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vcn_enc) failed.\n");
+         return false;
+      }
+   }
+
+   if (info->drm_minor >= 27) {
+      r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCN_JPEG, 0, &vcn_jpeg);
+      if (r) {
+         fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vcn_jpeg) failed.\n");
+         return false;
+      }
+   }
+
+   r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_ME, 0, 0, &info->me_fw_version,
+                                     &info->me_fw_feature);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(me) failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_PFP, 0, 0, &info->pfp_fw_version,
+                                     &info->pfp_fw_feature);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(pfp) failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_CE, 0, 0, &info->ce_fw_version,
+                                     &info->ce_fw_feature);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(ce) failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_UVD, 0, 0, &uvd_version, &uvd_feature);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(uvd) failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VCE, 0, &vce);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_hw_ip_info(vce) failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_VCE, 0, 0, &vce_version, &vce_feature);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_firmware_version(vce) failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_sw_info(dev, amdgpu_sw_info_address32_hi, &info->address32_hi);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_sw_info(address32_hi) failed.\n");
+      return false;
+   }
+
+   r = amdgpu_query_gds_info(dev, &gds);
+   if (r) {
+      fprintf(stderr, "amdgpu: amdgpu_query_gds_info failed.\n");
+      return false;
+   }
+
+   if (info->drm_minor >= 9) {
+      struct drm_amdgpu_memory_info meminfo = {};
+
+      r = amdgpu_query_info(dev, AMDGPU_INFO_MEMORY, sizeof(meminfo), &meminfo);
+      if (r) {
+         fprintf(stderr, "amdgpu: amdgpu_query_info(memory) failed.\n");
+         return false;
+      }
+
+      /* Note: usable_heap_size values can be random and can't be relied on. */
+      info->gart_size = meminfo.gtt.total_heap_size;
+      info->vram_size = fix_vram_size(meminfo.vram.total_heap_size);
+      info->vram_vis_size = meminfo.cpu_accessible_vram.total_heap_size;
+   } else {
+      /* This is a deprecated interface, which reports usable sizes
+       * (total minus pinned), but the pinned size computation is
+       * buggy, so the values returned from these functions can be
+       * random.
+       */
+      struct amdgpu_heap_info vram, vram_vis, gtt;
+
+      r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_VRAM, 0, &vram);
+      if (r) {
+         fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram) failed.\n");
+         return false;
+      }
+
+      r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_VRAM, AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED,
+                                 &vram_vis);
+      if (r) {
+         fprintf(stderr, "amdgpu: amdgpu_query_heap_info(vram_vis) failed.\n");
+         return false;
+      }
+
+      r = amdgpu_query_heap_info(dev, AMDGPU_GEM_DOMAIN_GTT, 0, &gtt);
+      if (r) {
+         fprintf(stderr, "amdgpu: amdgpu_query_heap_info(gtt) failed.\n");
+         return false;
+      }
+
+      info->gart_size = gtt.heap_size;
+      info->vram_size = fix_vram_size(vram.heap_size);
+      info->vram_vis_size = vram_vis.heap_size;
+   }
+
+   /* Set chip identification. */
+   info->pci_id = amdinfo->asic_id; /* TODO: is this correct? */
+   info->pci_rev_id = amdinfo->pci_rev_id;
+   info->vce_harvest_config = amdinfo->vce_harvest_config;
+
+#define identify_chip2(asic, chipname)                                                             \
+   if (ASICREV_IS(amdinfo->chip_external_rev, asic)) {                                             \
+      info->family = CHIP_##chipname;                                                              \
+      info->name = #chipname;                                                                      \
+   }
 #define identify_chip(chipname) identify_chip2(chipname, chipname)
 
-       switch (amdinfo->family_id) {
-       case FAMILY_SI:
-               identify_chip(TAHITI);
-               identify_chip(PITCAIRN);
-               identify_chip2(CAPEVERDE, VERDE);
-               identify_chip(OLAND);
-               identify_chip(HAINAN);
-               break;
-       case FAMILY_CI:
-               identify_chip(BONAIRE);
-               identify_chip(HAWAII);
-               break;
-       case FAMILY_KV:
-               identify_chip2(SPECTRE, KAVERI);
-               identify_chip2(SPOOKY, KAVERI);
-               identify_chip2(KALINDI, KABINI);
-               identify_chip2(GODAVARI, KABINI);
-               break;
-       case FAMILY_VI:
-               identify_chip(ICELAND);
-               identify_chip(TONGA);
-               identify_chip(FIJI);
-               identify_chip(POLARIS10);
-               identify_chip(POLARIS11);
-               identify_chip(POLARIS12);
-               identify_chip(VEGAM);
-               break;
-       case FAMILY_CZ:
-               identify_chip(CARRIZO);
-               identify_chip(STONEY);
-               break;
-       case FAMILY_AI:
-               identify_chip(VEGA10);
-               identify_chip(VEGA12);
-               identify_chip(VEGA20);
-               identify_chip(ARCTURUS);
-               break;
-       case FAMILY_RV:
-               identify_chip(RAVEN);
-               identify_chip(RAVEN2);
-               identify_chip(RENOIR);
-               break;
-       case FAMILY_NV:
-               identify_chip(NAVI10);
-               identify_chip(NAVI12);
-               identify_chip(NAVI14);
-               identify_chip(SIENNA_CICHLID);
-               identify_chip(NAVY_FLOUNDER);
-               break;
-       }
-
-       if (!info->name) {
-               fprintf(stderr, "amdgpu: unknown (family_id, chip_external_rev): (%u, %u)\n",
-                       amdinfo->family_id, amdinfo->chip_external_rev);
-               return false;
-       }
-
-       if (info->family >= CHIP_SIENNA_CICHLID)
-               info->chip_class = GFX10_3;
-       else if (info->family >= CHIP_NAVI10)
-               info->chip_class = GFX10;
-       else if (info->family >= CHIP_VEGA10)
-               info->chip_class = GFX9;
-       else if (info->family >= CHIP_TONGA)
-               info->chip_class = GFX8;
-       else if (info->family >= CHIP_BONAIRE)
-               info->chip_class = GFX7;
-       else if (info->family >= CHIP_TAHITI)
-               info->chip_class = GFX6;
-       else {
-               fprintf(stderr, "amdgpu: Unknown family.\n");
-               return false;
-       }
-
-       info->family_id = amdinfo->family_id;
-       info->chip_external_rev = amdinfo->chip_external_rev;
-       info->marketing_name = amdgpu_get_marketing_name(dev);
-       info->is_pro_graphics = info->marketing_name &&
-                               (!strcmp(info->marketing_name, "Pro") ||
-                                !strcmp(info->marketing_name, "PRO") ||
-                                !strcmp(info->marketing_name, "Frontier"));
-
-       /* Set which chips have dedicated VRAM. */
-       info->has_dedicated_vram =
-               !(amdinfo->ids_flags & AMDGPU_IDS_FLAGS_FUSION);
-
-       /* The kernel can split large buffers in VRAM but not in GTT, so large
-        * allocations can fail or cause buffer movement failures in the kernel.
-        */
-       if (info->has_dedicated_vram)
-               info->max_alloc_size = info->vram_size * 0.8;
-       else
-               info->max_alloc_size = info->gart_size * 0.7;
-
-       info->vram_type = amdinfo->vram_type;
-       info->vram_bit_width = amdinfo->vram_bit_width;
-       info->ce_ram_size = amdinfo->ce_ram_size;
-
-       info->l2_cache_size = get_l2_cache_size(info->family);
-       info->l1_cache_size = 16384;
-
-       /* Set which chips have uncached device memory. */
-       info->has_l2_uncached = info->chip_class >= GFX9;
-
-       /* Set hardware information. */
-       info->gds_size = gds.gds_total_size;
-       info->gds_gfx_partition_size = gds.gds_gfx_partition_size;
-       /* convert the shader/memory clocks from KHz to MHz */
-       info->max_shader_clock = amdinfo->max_engine_clk / 1000;
-       info->max_memory_clock = amdinfo->max_memory_clk / 1000;
-       info->num_tcc_blocks = device_info.num_tcc_blocks;
-       info->max_se = amdinfo->num_shader_engines;
-       info->max_sh_per_se = amdinfo->num_shader_arrays_per_engine;
-       info->has_hw_decode =
-               (uvd.available_rings != 0) || (vcn_dec.available_rings != 0) ||
-               (vcn_jpeg.available_rings != 0);
-       info->uvd_fw_version =
-               uvd.available_rings ? uvd_version : 0;
-       info->vce_fw_version =
-               vce.available_rings ? vce_version : 0;
-       info->uvd_enc_supported =
-               uvd_enc.available_rings ? true : false;
-       info->has_userptr = true;
-       info->has_syncobj = has_syncobj(fd);
-       info->has_timeline_syncobj = has_timeline_syncobj(fd);
-       info->has_syncobj_wait_for_submit = info->has_syncobj && info->drm_minor >= 20;
-       info->has_fence_to_handle = info->has_syncobj && info->drm_minor >= 21;
-       info->has_ctx_priority = info->drm_minor >= 22;
-       info->has_local_buffers = info->drm_minor >= 20;
-       info->kernel_flushes_hdp_before_ib = true;
-       info->htile_cmask_support_1d_tiling = true;
-       info->si_TA_CS_BC_BASE_ADDR_allowed = true;
-       info->has_bo_metadata = true;
-       info->has_gpu_reset_status_query = true;
-       info->has_eqaa_surface_allocator = true;
-       info->has_format_bc1_through_bc7 = true;
-       /* DRM 3.1.0 doesn't flush TC for GFX8 correctly. */
-       info->kernel_flushes_tc_l2_after_ib = info->chip_class != GFX8 ||
-                                             info->drm_minor >= 2;
-       info->has_indirect_compute_dispatch = true;
-       /* GFX6 doesn't support unaligned loads. */
-       info->has_unaligned_shader_loads = info->chip_class != GFX6;
-       /* Disable sparse mappings on GFX6 due to VM faults in CP DMA. Enable them once
-        * these faults are mitigated in software.
-        */
-       info->has_sparse_vm_mappings = info->chip_class >= GFX7 && info->drm_minor >= 13;
-       info->has_2d_tiling = true;
-       info->has_read_registers_query = true;
-       info->has_scheduled_fence_dependency = info->drm_minor >= 28;
-       info->mid_command_buffer_preemption_enabled =
-               amdinfo->ids_flags & AMDGPU_IDS_FLAGS_PREEMPTION;
-
-       info->pa_sc_tile_steering_override = device_info.pa_sc_tile_steering_override;
-       info->num_render_backends = amdinfo->rb_pipes;
-       /* The value returned by the kernel driver was wrong. */
-       if (info->family == CHIP_KAVERI)
-               info->num_render_backends = 2;
-
-       info->clock_crystal_freq = amdinfo->gpu_counter_freq;
-       if (!info->clock_crystal_freq) {
-               fprintf(stderr, "amdgpu: clock crystal frequency is 0, timestamps will be wrong\n");
-               info->clock_crystal_freq = 1;
-       }
-       if (info->chip_class >= GFX10) {
-               info->tcc_cache_line_size = 128;
-
-               if (info->drm_minor >= 35) {
-                       info->tcc_harvested = device_info.tcc_disabled_mask != 0;
-               } else {
-                       /* This is a hack, but it's all we can do without a kernel upgrade. */
-                       info->tcc_harvested =
-                               (info->vram_size / info->num_tcc_blocks) != 512*1024*1024;
-               }
-       } else {
-               info->tcc_cache_line_size = 64;
-       }
-       info->gb_addr_config = amdinfo->gb_addr_cfg;
-       if (info->chip_class >= GFX9) {
-               info->num_tile_pipes = 1 << G_0098F8_NUM_PIPES(amdinfo->gb_addr_cfg);
-               info->pipe_interleave_bytes =
-                       256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(amdinfo->gb_addr_cfg);
-       } else {
-               info->num_tile_pipes = cik_get_num_tile_pipes(amdinfo);
-               info->pipe_interleave_bytes =
-                       256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(amdinfo->gb_addr_cfg);
-       }
-       info->r600_has_virtual_memory = true;
-
-       /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
-        * 16KB makes some SIMDs unoccupied).
-        *
-        * LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used.
-        */
-       info->lds_size_per_workgroup = info->chip_class >= GFX10 ? 128 * 1024 : 64 * 1024;
-       info->lds_granularity = info->chip_class >= GFX7 ? 128 * 4 : 64 * 4;
-
-       assert(util_is_power_of_two_or_zero(dma.available_rings + 1));
-       assert(util_is_power_of_two_or_zero(compute.available_rings + 1));
-
-       info->has_graphics = gfx.available_rings > 0;
-       info->num_rings[RING_GFX] = util_bitcount(gfx.available_rings);
-       info->num_rings[RING_COMPUTE] = util_bitcount(compute.available_rings);
-       info->num_rings[RING_DMA] = util_bitcount(dma.available_rings);
-       info->num_rings[RING_UVD] = util_bitcount(uvd.available_rings);
-       info->num_rings[RING_VCE] = util_bitcount(vce.available_rings);
-       info->num_rings[RING_UVD_ENC] = util_bitcount(uvd_enc.available_rings);
-       info->num_rings[RING_VCN_DEC] = util_bitcount(vcn_dec.available_rings);
-       info->num_rings[RING_VCN_ENC] = util_bitcount(vcn_enc.available_rings);
-       info->num_rings[RING_VCN_JPEG] = util_bitcount(vcn_jpeg.available_rings);
-
-       /* This is "align_mask" copied from the kernel, maximums of all IP versions. */
-       info->ib_pad_dw_mask[RING_GFX] = 0xff;
-       info->ib_pad_dw_mask[RING_COMPUTE] = 0xff;
-       info->ib_pad_dw_mask[RING_DMA] = 0xf;
-       info->ib_pad_dw_mask[RING_UVD] = 0xf;
-       info->ib_pad_dw_mask[RING_VCE] = 0x3f;
-       info->ib_pad_dw_mask[RING_UVD_ENC] = 0x3f;
-       info->ib_pad_dw_mask[RING_VCN_DEC] = 0xf;
-       info->ib_pad_dw_mask[RING_VCN_ENC] = 0x3f;
-       info->ib_pad_dw_mask[RING_VCN_JPEG] = 0xf;
-
-       /* The mere presence of CLEAR_STATE in the IB causes random GPU hangs
-        * on GFX6. Some CLEAR_STATE cause asic hang on radeon kernel, etc.
-        * SPI_VS_OUT_CONFIG. So only enable GFX7 CLEAR_STATE on amdgpu kernel.
-        */
-       info->has_clear_state = info->chip_class >= GFX7;
-
-       info->has_distributed_tess = info->chip_class >= GFX10 ||
-                                    (info->chip_class >= GFX8 && info->max_se >= 2);
-
-       info->has_dcc_constant_encode = info->family == CHIP_RAVEN2 ||
-                                       info->family == CHIP_RENOIR ||
-                                       info->chip_class >= GFX10;
-
-       info->has_rbplus = info->family == CHIP_STONEY ||
-                          info->chip_class >= GFX9;
-
-       /* Some chips have RB+ registers, but don't support RB+. Those must
-        * always disable it.
-        */
-       info->rbplus_allowed = info->has_rbplus &&
-                              (info->family == CHIP_STONEY ||
-                               info->family == CHIP_VEGA12 ||
-                               info->family == CHIP_RAVEN ||
-                               info->family == CHIP_RAVEN2 ||
-                               info->family == CHIP_RENOIR ||
-                               info->chip_class >= GFX10_3);
-
-       info->has_out_of_order_rast = info->chip_class >= GFX8 &&
-                                     info->chip_class <= GFX9 &&
-                                     info->max_se >= 2;
-
-       /* Whether chips support double rate packed math instructions. */
-       info->has_packed_math_16bit = info->chip_class >= GFX9;
-
-       /* TODO: Figure out how to use LOAD_CONTEXT_REG on GFX6-GFX7. */
-       info->has_load_ctx_reg_pkt = info->chip_class >= GFX9 ||
-                                    (info->chip_class >= GFX8 &&
-                                     info->me_fw_feature >= 41);
-
-       info->cpdma_prefetch_writes_memory = info->chip_class <= GFX8;
-
-       info->has_gfx9_scissor_bug = info->family == CHIP_VEGA10 ||
-                                    info->family == CHIP_RAVEN;
-
-       info->has_tc_compat_zrange_bug = info->chip_class >= GFX8 &&
-                                        info->chip_class <= GFX9;
-
-       info->has_msaa_sample_loc_bug = (info->family >= CHIP_POLARIS10 &&
-                                        info->family <= CHIP_POLARIS12) ||
-                                       info->family == CHIP_VEGA10 ||
-                                       info->family == CHIP_RAVEN;
-
-       info->has_ls_vgpr_init_bug = info->family == CHIP_VEGA10 ||
-                                    info->family == CHIP_RAVEN;
-
-       /* Get the number of good compute units. */
-       info->num_good_compute_units = 0;
-       for (i = 0; i < info->max_se; i++) {
-               for (j = 0; j < info->max_sh_per_se; j++) {
-                       /*
-                        * The cu bitmap in amd gpu info structure is
-                        * 4x4 size array, and it's usually suitable for Vega
-                        * ASICs which has 4*2 SE/SH layout.
-                        * But for Arcturus, SE/SH layout is changed to 8*1.
-                        * To mostly reduce the impact, we make it compatible
-                        * with current bitmap array as below:
-                        *    SE4,SH0 --> cu_bitmap[0][1]
-                        *    SE5,SH0 --> cu_bitmap[1][1]
-                        *    SE6,SH0 --> cu_bitmap[2][1]
-                        *    SE7,SH0 --> cu_bitmap[3][1]
-                        */
-                       info->cu_mask[i%4][j+i/4] = amdinfo->cu_bitmap[i%4][j+i/4];
-                       info->num_good_compute_units +=
-                               util_bitcount(info->cu_mask[i][j]);
-               }
-       }
-
-       /* On GFX10, only whole WGPs (in units of 2 CUs) can be disabled,
-        * and max - min <= 2.
-        */
-       unsigned cu_group = info->chip_class >= GFX10 ? 2 : 1;
-       info->max_good_cu_per_sa = DIV_ROUND_UP(info->num_good_compute_units,
-                                               (info->max_se * info->max_sh_per_se * cu_group)) * cu_group;
-       info->min_good_cu_per_sa = (info->num_good_compute_units /
-                                   (info->max_se * info->max_sh_per_se * cu_group)) * cu_group;
-
-       memcpy(info->si_tile_mode_array, amdinfo->gb_tile_mode,
-               sizeof(amdinfo->gb_tile_mode));
-       info->enabled_rb_mask = amdinfo->enabled_rb_pipes_mask;
-
-       memcpy(info->cik_macrotile_mode_array, amdinfo->gb_macro_tile_mode,
-               sizeof(amdinfo->gb_macro_tile_mode));
-
-       info->pte_fragment_size = alignment_info.size_local;
-       info->gart_page_size = alignment_info.size_remote;
-
-       if (info->chip_class == GFX6)
-               info->gfx_ib_pad_with_type2 = true;
-
-       unsigned ib_align = 0;
-       ib_align = MAX2(ib_align, gfx.ib_start_alignment);
-       ib_align = MAX2(ib_align, gfx.ib_size_alignment);
-       ib_align = MAX2(ib_align, compute.ib_start_alignment);
-       ib_align = MAX2(ib_align, compute.ib_size_alignment);
-       ib_align = MAX2(ib_align, dma.ib_start_alignment);
-       ib_align = MAX2(ib_align, dma.ib_size_alignment);
-       ib_align = MAX2(ib_align, uvd.ib_start_alignment);
-       ib_align = MAX2(ib_align, uvd.ib_size_alignment);
-       ib_align = MAX2(ib_align, uvd_enc.ib_start_alignment);
-       ib_align = MAX2(ib_align, uvd_enc.ib_size_alignment);
-       ib_align = MAX2(ib_align, vce.ib_start_alignment);
-       ib_align = MAX2(ib_align, vce.ib_size_alignment);
-       ib_align = MAX2(ib_align, vcn_dec.ib_start_alignment);
-       ib_align = MAX2(ib_align, vcn_dec.ib_size_alignment);
-       ib_align = MAX2(ib_align, vcn_enc.ib_start_alignment);
-       ib_align = MAX2(ib_align, vcn_enc.ib_size_alignment);
-       ib_align = MAX2(ib_align, vcn_jpeg.ib_start_alignment);
-       ib_align = MAX2(ib_align, vcn_jpeg.ib_size_alignment);
-       /* GFX10 and maybe GFX9 need this alignment for cache coherency. */
-       if (info->chip_class >= GFX9)
-               ib_align = MAX2(ib_align, info->tcc_cache_line_size);
-       /* The kernel pads gfx and compute IBs to 256 dwords since:
-        *   66f3b2d527154bd258a57c8815004b5964aa1cf5
-        * Do the same.
-        */
-       ib_align = MAX2(ib_align, 1024);
-       info->ib_alignment = ib_align;
-
-        if ((info->drm_minor >= 31 &&
-             (info->family == CHIP_RAVEN ||
-              info->family == CHIP_RAVEN2 ||
-              info->family == CHIP_RENOIR)) ||
-            (info->drm_minor >= 34 &&
-             (info->family == CHIP_NAVI12 ||
-              info->family == CHIP_NAVI14)) ||
-            info->chip_class >= GFX10_3) {
-               if (info->num_render_backends == 1)
-                       info->use_display_dcc_unaligned = true;
-               else
-                       info->use_display_dcc_with_retile_blit = true;
-       }
-
-       info->has_gds_ordered_append = info->chip_class >= GFX7 &&
-                                      info->drm_minor >= 29;
-
-       if (info->chip_class >= GFX9) {
-               unsigned pc_lines = 0;
-
-               switch (info->family) {
-               case CHIP_VEGA10:
-               case CHIP_VEGA12:
-               case CHIP_VEGA20:
-                       pc_lines = 2048;
-                       break;
-               case CHIP_RAVEN:
-               case CHIP_RAVEN2:
-               case CHIP_RENOIR:
-               case CHIP_NAVI10:
-               case CHIP_NAVI12:
-               case CHIP_SIENNA_CICHLID:
-               case CHIP_NAVY_FLOUNDER:
-                       pc_lines = 1024;
-                       break;
-               case CHIP_NAVI14:
-                       pc_lines = 512;
-                       break;
-               case CHIP_ARCTURUS:
-                       break;
-               default:
-                       assert(0);
-               }
-
-               info->pc_lines = pc_lines;
-
-               if (info->chip_class >= GFX10) {
-                       info->pbb_max_alloc_count = pc_lines / 3;
-               } else {
-                       info->pbb_max_alloc_count =
-                               MIN2(128, pc_lines / (4 * info->max_se));
-               }
-       }
-
-       /* The number of SDPs is the same as the number of TCCs for now. */
-       if (info->chip_class >= GFX10)
-               info->num_sdp_interfaces = device_info.num_tcc_blocks;
-
-       if (info->chip_class >= GFX10_3)
-               info->max_wave64_per_simd = 16;
-       else if (info->chip_class == GFX10)
-               info->max_wave64_per_simd = 20;
-       else if (info->family >= CHIP_POLARIS10 && info->family <= CHIP_VEGAM)
-               info->max_wave64_per_simd = 8;
-       else
-               info->max_wave64_per_simd = 10;
-
-       if (info->chip_class >= GFX10) {
-               info->num_physical_sgprs_per_simd = 128 * info->max_wave64_per_simd;
-               info->min_sgpr_alloc = 128;
-               info->sgpr_alloc_granularity = 128;
-               /* Don't use late alloc on small chips. */
-               info->use_late_alloc = info->num_render_backends > 4;
-       } else if (info->chip_class >= GFX8) {
-               info->num_physical_sgprs_per_simd = 800;
-               info->min_sgpr_alloc = 16;
-               info->sgpr_alloc_granularity = 16;
-               info->use_late_alloc = true;
-       } else {
-               info->num_physical_sgprs_per_simd = 512;
-               info->min_sgpr_alloc = 8;
-               info->sgpr_alloc_granularity = 8;
-               /* Potential hang on Kabini: */
-               info->use_late_alloc = info->family != CHIP_KABINI;
-       }
-
-       info->max_sgpr_alloc = info->family == CHIP_TONGA ||
-                              info->family == CHIP_ICELAND ? 96 : 104;
-
-       info->min_wave64_vgpr_alloc = 4;
-       info->max_vgpr_alloc = 256;
-       info->wave64_vgpr_alloc_granularity = 4;
-
-       info->num_physical_wave64_vgprs_per_simd = info->chip_class >= GFX10 ? 512 : 256;
-       info->num_simd_per_compute_unit = info->chip_class >= GFX10 ? 2 : 4;
-
-       return true;
+   switch (amdinfo->family_id) {
+   case FAMILY_SI:
+      identify_chip(TAHITI);
+      identify_chip(PITCAIRN);
+      identify_chip2(CAPEVERDE, VERDE);
+      identify_chip(OLAND);
+      identify_chip(HAINAN);
+      break;
+   case FAMILY_CI:
+      identify_chip(BONAIRE);
+      identify_chip(HAWAII);
+      break;
+   case FAMILY_KV:
+      identify_chip2(SPECTRE, KAVERI);
+      identify_chip2(SPOOKY, KAVERI);
+      identify_chip2(KALINDI, KABINI);
+      identify_chip2(GODAVARI, KABINI);
+      break;
+   case FAMILY_VI:
+      identify_chip(ICELAND);
+      identify_chip(TONGA);
+      identify_chip(FIJI);
+      identify_chip(POLARIS10);
+      identify_chip(POLARIS11);
+      identify_chip(POLARIS12);
+      identify_chip(VEGAM);
+      break;
+   case FAMILY_CZ:
+      identify_chip(CARRIZO);
+      identify_chip(STONEY);
+      break;
+   case FAMILY_AI:
+      identify_chip(VEGA10);
+      identify_chip(VEGA12);
+      identify_chip(VEGA20);
+      identify_chip(ARCTURUS);
+      break;
+   case FAMILY_RV:
+      identify_chip(RAVEN);
+      identify_chip(RAVEN2);
+      identify_chip(RENOIR);
+      break;
+   case FAMILY_NV:
+      identify_chip(NAVI10);
+      identify_chip(NAVI12);
+      identify_chip(NAVI14);
+      identify_chip(SIENNA_CICHLID);
+      identify_chip(NAVY_FLOUNDER);
+      break;
+   }
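+   /* The identify_chip() macros above set info->family and info->name when
+    * amdinfo->chip_external_rev falls in the given chip's revision range.
+    */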
+
+   if (!info->name) {
+      fprintf(stderr, "amdgpu: unknown (family_id, chip_external_rev): (%u, %u)\n",
+              amdinfo->family_id, amdinfo->chip_external_rev);
+      return false;
+   }
+
+   if (info->family >= CHIP_SIENNA_CICHLID)
+      info->chip_class = GFX10_3;
+   else if (info->family >= CHIP_NAVI10)
+      info->chip_class = GFX10;
+   else if (info->family >= CHIP_VEGA10)
+      info->chip_class = GFX9;
+   else if (info->family >= CHIP_TONGA)
+      info->chip_class = GFX8;
+   else if (info->family >= CHIP_BONAIRE)
+      info->chip_class = GFX7;
+   else if (info->family >= CHIP_TAHITI)
+      info->chip_class = GFX6;
+   else {
+      fprintf(stderr, "amdgpu: Unknown family.\n");
+      return false;
+   }
+
+   info->family_id = amdinfo->family_id;
+   info->chip_external_rev = amdinfo->chip_external_rev;
+   info->marketing_name = amdgpu_get_marketing_name(dev);
+   info->is_pro_graphics = info->marketing_name && (!strcmp(info->marketing_name, "Pro") ||
+                                                    !strcmp(info->marketing_name, "PRO") ||
+                                                    !strcmp(info->marketing_name, "Frontier"));
+
+   /* Set which chips have dedicated VRAM. */
+   info->has_dedicated_vram = !(amdinfo->ids_flags & AMDGPU_IDS_FLAGS_FUSION);
+
+   /* The kernel can split large buffers in VRAM but not in GTT, so large
+    * allocations can fail or cause buffer movement failures in the kernel.
+    */
+   if (info->has_dedicated_vram)
+      info->max_alloc_size = info->vram_size * 0.8;
+   else
+      info->max_alloc_size = info->gart_size * 0.7;
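+   /* E.g. a dedicated-VRAM board with 8 GB of VRAM gets a max_alloc_size of ~6.4 GB here. */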
+
+   info->vram_type = amdinfo->vram_type;
+   info->vram_bit_width = amdinfo->vram_bit_width;
+   info->ce_ram_size = amdinfo->ce_ram_size;
+
+   info->l2_cache_size = get_l2_cache_size(info->family);
+   info->l1_cache_size = 16384;
+
+   /* Set which chips have uncached device memory. */
+   info->has_l2_uncached = info->chip_class >= GFX9;
+
+   /* Set hardware information. */
+   info->gds_size = gds.gds_total_size;
+   info->gds_gfx_partition_size = gds.gds_gfx_partition_size;
+   /* Convert the shader/memory clocks from kHz to MHz. */
+   info->max_shader_clock = amdinfo->max_engine_clk / 1000;
+   info->max_memory_clock = amdinfo->max_memory_clk / 1000;
+   info->num_tcc_blocks = device_info.num_tcc_blocks;
+   info->max_se = amdinfo->num_shader_engines;
+   info->max_sh_per_se = amdinfo->num_shader_arrays_per_engine;
+   info->has_hw_decode = (uvd.available_rings != 0) || (vcn_dec.available_rings != 0) ||
+                         (vcn_jpeg.available_rings != 0);
+   info->uvd_fw_version = uvd.available_rings ? uvd_version : 0;
+   info->vce_fw_version = vce.available_rings ? vce_version : 0;
+   info->uvd_enc_supported = uvd_enc.available_rings ? true : false;
+   info->has_userptr = true;
+   info->has_syncobj = has_syncobj(fd);
+   info->has_timeline_syncobj = has_timeline_syncobj(fd);
+   info->has_syncobj_wait_for_submit = info->has_syncobj && info->drm_minor >= 20;
+   info->has_fence_to_handle = info->has_syncobj && info->drm_minor >= 21;
+   info->has_ctx_priority = info->drm_minor >= 22;
+   info->has_local_buffers = info->drm_minor >= 20;
+   info->kernel_flushes_hdp_before_ib = true;
+   info->htile_cmask_support_1d_tiling = true;
+   info->si_TA_CS_BC_BASE_ADDR_allowed = true;
+   info->has_bo_metadata = true;
+   info->has_gpu_reset_status_query = true;
+   info->has_eqaa_surface_allocator = true;
+   info->has_format_bc1_through_bc7 = true;
+   /* DRM 3.1.0 doesn't flush TC for GFX8 correctly. */
+   info->kernel_flushes_tc_l2_after_ib = info->chip_class != GFX8 || info->drm_minor >= 2;
+   info->has_indirect_compute_dispatch = true;
+   /* GFX6 doesn't support unaligned loads. */
+   info->has_unaligned_shader_loads = info->chip_class != GFX6;
+   /* Disable sparse mappings on GFX6 due to VM faults in CP DMA. Enable them once
+    * these faults are mitigated in software.
+    */
+   info->has_sparse_vm_mappings = info->chip_class >= GFX7 && info->drm_minor >= 13;
+   info->has_2d_tiling = true;
+   info->has_read_registers_query = true;
+   info->has_scheduled_fence_dependency = info->drm_minor >= 28;
+   info->mid_command_buffer_preemption_enabled = amdinfo->ids_flags & AMDGPU_IDS_FLAGS_PREEMPTION;
+
+   info->pa_sc_tile_steering_override = device_info.pa_sc_tile_steering_override;
+   info->num_render_backends = amdinfo->rb_pipes;
+   /* The value returned by the kernel driver was wrong. */
+   if (info->family == CHIP_KAVERI)
+      info->num_render_backends = 2;
+
+   info->clock_crystal_freq = amdinfo->gpu_counter_freq;
+   if (!info->clock_crystal_freq) {
+      fprintf(stderr, "amdgpu: clock crystal frequency is 0, timestamps will be wrong\n");
+      info->clock_crystal_freq = 1;
+   }
+   if (info->chip_class >= GFX10) {
+      info->tcc_cache_line_size = 128;
+
+      if (info->drm_minor >= 35) {
+         info->tcc_harvested = device_info.tcc_disabled_mask != 0;
+      } else {
+         /* This is a hack: assume an unharvested chip has 512 MB of VRAM per TCC.
+          * It's all we can do without a kernel upgrade.
+          */
+         info->tcc_harvested = (info->vram_size / info->num_tcc_blocks) != 512 * 1024 * 1024;
+      }
+   } else {
+      info->tcc_cache_line_size = 64;
+   }
+   info->gb_addr_config = amdinfo->gb_addr_cfg;
+   if (info->chip_class >= GFX9) {
+      info->num_tile_pipes = 1 << G_0098F8_NUM_PIPES(amdinfo->gb_addr_cfg);
+      info->pipe_interleave_bytes = 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(amdinfo->gb_addr_cfg);
+   } else {
+      info->num_tile_pipes = cik_get_num_tile_pipes(amdinfo);
+      info->pipe_interleave_bytes = 256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(amdinfo->gb_addr_cfg);
+   }
+   info->r600_has_virtual_memory = true;
+
+   /* On GFX6-GFX9, LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage above
+    * 16KB makes some SIMDs unoccupied).
+    *
+    * On GFX10, LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used.
+    */
+   info->lds_size_per_workgroup = info->chip_class >= GFX10 ? 128 * 1024 : 64 * 1024;
+   info->lds_granularity = info->chip_class >= GFX7 ? 128 * 4 : 64 * 4;
+
+   assert(util_is_power_of_two_or_zero(dma.available_rings + 1));
+   assert(util_is_power_of_two_or_zero(compute.available_rings + 1));
+
+   info->has_graphics = gfx.available_rings > 0;
+   info->num_rings[RING_GFX] = util_bitcount(gfx.available_rings);
+   info->num_rings[RING_COMPUTE] = util_bitcount(compute.available_rings);
+   info->num_rings[RING_DMA] = util_bitcount(dma.available_rings);
+   info->num_rings[RING_UVD] = util_bitcount(uvd.available_rings);
+   info->num_rings[RING_VCE] = util_bitcount(vce.available_rings);
+   info->num_rings[RING_UVD_ENC] = util_bitcount(uvd_enc.available_rings);
+   info->num_rings[RING_VCN_DEC] = util_bitcount(vcn_dec.available_rings);
+   info->num_rings[RING_VCN_ENC] = util_bitcount(vcn_enc.available_rings);
+   info->num_rings[RING_VCN_JPEG] = util_bitcount(vcn_jpeg.available_rings);
+
+   /* This is "align_mask" copied from the kernel, maximums of all IP versions. */
+   info->ib_pad_dw_mask[RING_GFX] = 0xff;
+   info->ib_pad_dw_mask[RING_COMPUTE] = 0xff;
+   info->ib_pad_dw_mask[RING_DMA] = 0xf;
+   info->ib_pad_dw_mask[RING_UVD] = 0xf;
+   info->ib_pad_dw_mask[RING_VCE] = 0x3f;
+   info->ib_pad_dw_mask[RING_UVD_ENC] = 0x3f;
+   info->ib_pad_dw_mask[RING_VCN_DEC] = 0xf;
+   info->ib_pad_dw_mask[RING_VCN_ENC] = 0x3f;
+   info->ib_pad_dw_mask[RING_VCN_JPEG] = 0xf;
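+   /* E.g. a mask of 0xff means IBs are padded to a multiple of 0xff + 1 = 256 dwords. */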
+
+   /* The mere presence of CLEAR_STATE in the IB causes random GPU hangs on GFX6.
+    * Some CLEAR_STATE packets also hang the ASIC on the radeon kernel
+    * (SPI_VS_OUT_CONFIG, etc.), so only enable CLEAR_STATE on GFX7+ with the
+    * amdgpu kernel.
+    */
+   info->has_clear_state = info->chip_class >= GFX7;
+
+   info->has_distributed_tess =
+      info->chip_class >= GFX10 || (info->chip_class >= GFX8 && info->max_se >= 2);
+
+   info->has_dcc_constant_encode =
+      info->family == CHIP_RAVEN2 || info->family == CHIP_RENOIR || info->chip_class >= GFX10;
+
+   info->has_rbplus = info->family == CHIP_STONEY || info->chip_class >= GFX9;
+
+   /* Some chips have RB+ registers, but don't support RB+. Those must
+    * always disable it.
+    */
+   info->rbplus_allowed =
+      info->has_rbplus &&
+      (info->family == CHIP_STONEY || info->family == CHIP_VEGA12 || info->family == CHIP_RAVEN ||
+       info->family == CHIP_RAVEN2 || info->family == CHIP_RENOIR || info->chip_class >= GFX10_3);
+
+   info->has_out_of_order_rast =
+      info->chip_class >= GFX8 && info->chip_class <= GFX9 && info->max_se >= 2;
+
+   /* Whether chips support double rate packed math instructions. */
+   info->has_packed_math_16bit = info->chip_class >= GFX9;
+
+   /* TODO: Figure out how to use LOAD_CONTEXT_REG on GFX6-GFX7. */
+   info->has_load_ctx_reg_pkt =
+      info->chip_class >= GFX9 || (info->chip_class >= GFX8 && info->me_fw_feature >= 41);
+
+   info->cpdma_prefetch_writes_memory = info->chip_class <= GFX8;
+
+   info->has_gfx9_scissor_bug = info->family == CHIP_VEGA10 || info->family == CHIP_RAVEN;
+
+   info->has_tc_compat_zrange_bug = info->chip_class >= GFX8 && info->chip_class <= GFX9;
+
+   info->has_msaa_sample_loc_bug =
+      (info->family >= CHIP_POLARIS10 && info->family <= CHIP_POLARIS12) ||
+      info->family == CHIP_VEGA10 || info->family == CHIP_RAVEN;
+
+   info->has_ls_vgpr_init_bug = info->family == CHIP_VEGA10 || info->family == CHIP_RAVEN;
+
+   /* Get the number of good compute units. */
+   info->num_good_compute_units = 0;
+   for (i = 0; i < info->max_se; i++) {
+      for (j = 0; j < info->max_sh_per_se; j++) {
+         /*
+          * The CU bitmap in the amdgpu info structure is a 4x4 array,
+          * which is usually suitable for Vega ASICs with their 4*2
+          * SE/SH layout. For Arcturus, the SE/SH layout changed to 8*1.
+          * To mostly reduce the impact, make it compatible with the
+          * current bitmap array as below:
+          *    SE4,SH0 --> cu_bitmap[0][1]
+          *    SE5,SH0 --> cu_bitmap[1][1]
+          *    SE6,SH0 --> cu_bitmap[2][1]
+          *    SE7,SH0 --> cu_bitmap[3][1]
+          */
+         info->cu_mask[i % 4][j + i / 4] = amdinfo->cu_bitmap[i % 4][j + i / 4];
+         info->num_good_compute_units += util_bitcount(info->cu_mask[i][j]);
+      }
+   }
+
+   /* On GFX10, only whole WGPs (in units of 2 CUs) can be disabled,
+    * and max - min <= 2.
+    */
+   unsigned cu_group = info->chip_class >= GFX10 ? 2 : 1;
+   info->max_good_cu_per_sa =
+      DIV_ROUND_UP(info->num_good_compute_units, (info->max_se * info->max_sh_per_se * cu_group)) *
+      cu_group;
+   info->min_good_cu_per_sa =
+      (info->num_good_compute_units / (info->max_se * info->max_sh_per_se * cu_group)) * cu_group;
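+   /* E.g. 28 enabled CUs spread over 2 SEs * 2 SHs with cu_group = 2 gives max = 8 and min = 6. */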
+
+   memcpy(info->si_tile_mode_array, amdinfo->gb_tile_mode, sizeof(amdinfo->gb_tile_mode));
+   info->enabled_rb_mask = amdinfo->enabled_rb_pipes_mask;
+
+   memcpy(info->cik_macrotile_mode_array, amdinfo->gb_macro_tile_mode,
+          sizeof(amdinfo->gb_macro_tile_mode));
+
+   info->pte_fragment_size = alignment_info.size_local;
+   info->gart_page_size = alignment_info.size_remote;
+
+   if (info->chip_class == GFX6)
+      info->gfx_ib_pad_with_type2 = true;
+
+   unsigned ib_align = 0;
+   ib_align = MAX2(ib_align, gfx.ib_start_alignment);
+   ib_align = MAX2(ib_align, gfx.ib_size_alignment);
+   ib_align = MAX2(ib_align, compute.ib_start_alignment);
+   ib_align = MAX2(ib_align, compute.ib_size_alignment);
+   ib_align = MAX2(ib_align, dma.ib_start_alignment);
+   ib_align = MAX2(ib_align, dma.ib_size_alignment);
+   ib_align = MAX2(ib_align, uvd.ib_start_alignment);
+   ib_align = MAX2(ib_align, uvd.ib_size_alignment);
+   ib_align = MAX2(ib_align, uvd_enc.ib_start_alignment);
+   ib_align = MAX2(ib_align, uvd_enc.ib_size_alignment);
+   ib_align = MAX2(ib_align, vce.ib_start_alignment);
+   ib_align = MAX2(ib_align, vce.ib_size_alignment);
+   ib_align = MAX2(ib_align, vcn_dec.ib_start_alignment);
+   ib_align = MAX2(ib_align, vcn_dec.ib_size_alignment);
+   ib_align = MAX2(ib_align, vcn_enc.ib_start_alignment);
+   ib_align = MAX2(ib_align, vcn_enc.ib_size_alignment);
+   ib_align = MAX2(ib_align, vcn_jpeg.ib_start_alignment);
+   ib_align = MAX2(ib_align, vcn_jpeg.ib_size_alignment);
+   /* GFX10 and maybe GFX9 need this alignment for cache coherency. */
+   if (info->chip_class >= GFX9)
+      ib_align = MAX2(ib_align, info->tcc_cache_line_size);
+   /* The kernel pads gfx and compute IBs to 256 dwords (1024 bytes) since:
+    *   66f3b2d527154bd258a57c8815004b5964aa1cf5
+    * Do the same.
+    */
+   ib_align = MAX2(ib_align, 1024);
+   info->ib_alignment = ib_align;
+
+   if ((info->drm_minor >= 31 && (info->family == CHIP_RAVEN || info->family == CHIP_RAVEN2 ||
+                                  info->family == CHIP_RENOIR)) ||
+       (info->drm_minor >= 34 && (info->family == CHIP_NAVI12 || info->family == CHIP_NAVI14)) ||
+       info->chip_class >= GFX10_3) {
+      if (info->num_render_backends == 1)
+         info->use_display_dcc_unaligned = true;
+      else
+         info->use_display_dcc_with_retile_blit = true;
+   }
+
+   info->has_gds_ordered_append = info->chip_class >= GFX7 && info->drm_minor >= 29;
+
+   if (info->chip_class >= GFX9) {
+      unsigned pc_lines = 0;
+
+      switch (info->family) {
+      case CHIP_VEGA10:
+      case CHIP_VEGA12:
+      case CHIP_VEGA20:
+         pc_lines = 2048;
+         break;
+      case CHIP_RAVEN:
+      case CHIP_RAVEN2:
+      case CHIP_RENOIR:
+      case CHIP_NAVI10:
+      case CHIP_NAVI12:
+      case CHIP_SIENNA_CICHLID:
+      case CHIP_NAVY_FLOUNDER:
+         pc_lines = 1024;
+         break;
+      case CHIP_NAVI14:
+         pc_lines = 512;
+         break;
+      case CHIP_ARCTURUS:
+         break;
+      default:
+         assert(0);
+      }
+
+      info->pc_lines = pc_lines;
+
+      if (info->chip_class >= GFX10) {
+         info->pbb_max_alloc_count = pc_lines / 3;
+      } else {
+         info->pbb_max_alloc_count = MIN2(128, pc_lines / (4 * info->max_se));
+      }
+   }
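+   /* E.g. pc_lines = 1024 gives pbb_max_alloc_count = 341 on GFX10 and 128 on a 2-SE GFX9 part. */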
+
+   /* The number of SDPs is the same as the number of TCCs for now. */
+   if (info->chip_class >= GFX10)
+      info->num_sdp_interfaces = device_info.num_tcc_blocks;
+
+   if (info->chip_class >= GFX10_3)
+      info->max_wave64_per_simd = 16;
+   else if (info->chip_class == GFX10)
+      info->max_wave64_per_simd = 20;
+   else if (info->family >= CHIP_POLARIS10 && info->family <= CHIP_VEGAM)
+      info->max_wave64_per_simd = 8;
+   else
+      info->max_wave64_per_simd = 10;
+
+   if (info->chip_class >= GFX10) {
+      info->num_physical_sgprs_per_simd = 128 * info->max_wave64_per_simd;
+      info->min_sgpr_alloc = 128;
+      info->sgpr_alloc_granularity = 128;
+      /* Don't use late alloc on small chips. */
+      info->use_late_alloc = info->num_render_backends > 4;
+   } else if (info->chip_class >= GFX8) {
+      info->num_physical_sgprs_per_simd = 800;
+      info->min_sgpr_alloc = 16;
+      info->sgpr_alloc_granularity = 16;
+      info->use_late_alloc = true;
+   } else {
+      info->num_physical_sgprs_per_simd = 512;
+      info->min_sgpr_alloc = 8;
+      info->sgpr_alloc_granularity = 8;
+      /* Potential hang on Kabini: */
+      info->use_late_alloc = info->family != CHIP_KABINI;
+   }
+
+   info->max_sgpr_alloc = info->family == CHIP_TONGA || info->family == CHIP_ICELAND ? 96 : 104;
+
+   info->min_wave64_vgpr_alloc = 4;
+   info->max_vgpr_alloc = 256;
+   info->wave64_vgpr_alloc_granularity = 4;
+
+   info->num_physical_wave64_vgprs_per_simd = info->chip_class >= GFX10 ? 512 : 256;
+   info->num_simd_per_compute_unit = info->chip_class >= GFX10 ? 2 : 4;
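+   /* E.g. on GFX10, a wave64 needing 64 VGPRs can have at most 512 / 64 = 8 waves resident
+    * per SIMD, further capped by max_wave64_per_simd above.
+    */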
+
+   return true;
 }
 
 void ac_compute_driver_uuid(char *uuid, size_t size)
 {
-       char amd_uuid[] = "AMD-MESA-DRV";
+   char amd_uuid[] = "AMD-MESA-DRV";
 
-       assert(size >= sizeof(amd_uuid));
+   assert(size >= sizeof(amd_uuid));
 
-       memset(uuid, 0, size);
-       strncpy(uuid, amd_uuid, size);
+   memset(uuid, 0, size);
+   strncpy(uuid, amd_uuid, size);
 }
 
 void ac_compute_device_uuid(struct radeon_info *info, char *uuid, size_t size)
 {
-       uint32_t *uint_uuid = (uint32_t*)uuid;
-
-       assert(size >= sizeof(uint32_t)*4);
-
-       /**
-        * Use the device info directly instead of using a sha1. GL/VK UUIDs
-        * are 16 byte vs 20 byte for sha1, and the truncation that would be
-        * required would get rid of part of the little entropy we have.
-        * */
-       memset(uuid, 0, size);
-       uint_uuid[0] = info->pci_domain;
-       uint_uuid[1] = info->pci_bus;
-       uint_uuid[2] = info->pci_dev;
-       uint_uuid[3] = info->pci_func;
+   uint32_t *uint_uuid = (uint32_t *)uuid;
+
+   assert(size >= sizeof(uint32_t) * 4);
+
+   /**
+    * Use the device info directly instead of using a sha1. GL/VK UUIDs
+    * are 16 bytes vs 20 bytes for sha1, and the truncation that would be
+    * required would get rid of part of the little entropy we have.
+    */
+   memset(uuid, 0, size);
+   uint_uuid[0] = info->pci_domain;
+   uint_uuid[1] = info->pci_bus;
+   uint_uuid[2] = info->pci_dev;
+   uint_uuid[3] = info->pci_func;
 }
 
 void ac_print_gpu_info(struct radeon_info *info)
 {
-       printf("Device info:\n");
-       printf("    pci (domain:bus:dev.func): %04x:%02x:%02x.%x\n",
-              info->pci_domain, info->pci_bus,
-              info->pci_dev, info->pci_func);
-
-       printf("    name = %s\n", info->name);
-       printf("    marketing_name = %s\n", info->marketing_name);
-       printf("    is_pro_graphics = %u\n", info->is_pro_graphics);
-       printf("    pci_id = 0x%x\n", info->pci_id);
-       printf("    pci_rev_id = 0x%x\n", info->pci_rev_id);
-       printf("    family = %i\n", info->family);
-       printf("    chip_class = %i\n", info->chip_class);
-       printf("    family_id = %i\n", info->family_id);
-       printf("    chip_external_rev = %i\n", info->chip_external_rev);
-       printf("    clock_crystal_freq = %i\n", info->clock_crystal_freq);
-
-       printf("Features:\n");
-       printf("    has_graphics = %i\n", info->has_graphics);
-       printf("    num_rings[RING_GFX] = %i\n", info->num_rings[RING_GFX]);
-       printf("    num_rings[RING_DMA] = %i\n", info->num_rings[RING_DMA]);
-       printf("    num_rings[RING_COMPUTE] = %u\n", info->num_rings[RING_COMPUTE]);
-       printf("    num_rings[RING_UVD] = %i\n", info->num_rings[RING_UVD]);
-       printf("    num_rings[RING_VCE] = %i\n", info->num_rings[RING_VCE]);
-       printf("    num_rings[RING_UVD_ENC] = %i\n", info->num_rings[RING_UVD_ENC]);
-       printf("    num_rings[RING_VCN_DEC] = %i\n", info->num_rings[RING_VCN_DEC]);
-       printf("    num_rings[RING_VCN_ENC] = %i\n", info->num_rings[RING_VCN_ENC]);
-       printf("    num_rings[RING_VCN_JPEG] = %i\n", info->num_rings[RING_VCN_JPEG]);
-       printf("    has_clear_state = %u\n", info->has_clear_state);
-       printf("    has_distributed_tess = %u\n", info->has_distributed_tess);
-       printf("    has_dcc_constant_encode = %u\n", info->has_dcc_constant_encode);
-       printf("    has_rbplus = %u\n", info->has_rbplus);
-       printf("    rbplus_allowed = %u\n", info->rbplus_allowed);
-       printf("    has_load_ctx_reg_pkt = %u\n", info->has_load_ctx_reg_pkt);
-       printf("    has_out_of_order_rast = %u\n", info->has_out_of_order_rast);
-       printf("    cpdma_prefetch_writes_memory = %u\n", info->cpdma_prefetch_writes_memory);
-       printf("    has_gfx9_scissor_bug = %i\n", info->has_gfx9_scissor_bug);
-       printf("    has_tc_compat_zrange_bug = %i\n", info->has_tc_compat_zrange_bug);
-       printf("    has_msaa_sample_loc_bug = %i\n", info->has_msaa_sample_loc_bug);
-       printf("    has_ls_vgpr_init_bug = %i\n", info->has_ls_vgpr_init_bug);
-
-       printf("Display features:\n");
-       printf("    use_display_dcc_unaligned = %u\n", info->use_display_dcc_unaligned);
-       printf("    use_display_dcc_with_retile_blit = %u\n", info->use_display_dcc_with_retile_blit);
-
-       printf("Memory info:\n");
-       printf("    pte_fragment_size = %u\n", info->pte_fragment_size);
-       printf("    gart_page_size = %u\n", info->gart_page_size);
-       printf("    gart_size = %i MB\n", (int)DIV_ROUND_UP(info->gart_size, 1024*1024));
-       printf("    vram_size = %i MB\n", (int)DIV_ROUND_UP(info->vram_size, 1024*1024));
-       printf("    vram_vis_size = %i MB\n", (int)DIV_ROUND_UP(info->vram_vis_size, 1024*1024));
-       printf("    vram_type = %i\n", info->vram_type);
-       printf("    vram_bit_width = %i\n", info->vram_bit_width);
-       printf("    gds_size = %u kB\n", info->gds_size / 1024);
-       printf("    gds_gfx_partition_size = %u kB\n", info->gds_gfx_partition_size / 1024);
-       printf("    max_alloc_size = %i MB\n",
-              (int)DIV_ROUND_UP(info->max_alloc_size, 1024*1024));
-       printf("    min_alloc_size = %u\n", info->min_alloc_size);
-       printf("    address32_hi = %u\n", info->address32_hi);
-       printf("    has_dedicated_vram = %u\n", info->has_dedicated_vram);
-       printf("    num_sdp_interfaces = %u\n", info->num_sdp_interfaces);
-       printf("    num_tcc_blocks = %i\n", info->num_tcc_blocks);
-       printf("    tcc_cache_line_size = %u\n", info->tcc_cache_line_size);
-       printf("    tcc_harvested = %u\n", info->tcc_harvested);
-       printf("    pc_lines = %u\n", info->pc_lines);
-       printf("    lds_size_per_workgroup = %u\n", info->lds_size_per_workgroup);
-       printf("    lds_granularity = %i\n", info->lds_granularity);
-       printf("    max_memory_clock = %i\n", info->max_memory_clock);
-       printf("    ce_ram_size = %i\n", info->ce_ram_size);
-       printf("    l1_cache_size = %i\n", info->l1_cache_size);
-       printf("    l2_cache_size = %i\n", info->l2_cache_size);
-
-       printf("CP info:\n");
-       printf("    gfx_ib_pad_with_type2 = %i\n", info->gfx_ib_pad_with_type2);
-       printf("    ib_alignment = %u\n", info->ib_alignment);
-       printf("    me_fw_version = %i\n", info->me_fw_version);
-       printf("    me_fw_feature = %i\n", info->me_fw_feature);
-       printf("    pfp_fw_version = %i\n", info->pfp_fw_version);
-       printf("    pfp_fw_feature = %i\n", info->pfp_fw_feature);
-       printf("    ce_fw_version = %i\n", info->ce_fw_version);
-       printf("    ce_fw_feature = %i\n", info->ce_fw_feature);
-
-       printf("Multimedia info:\n");
-       printf("    has_hw_decode = %u\n", info->has_hw_decode);
-       printf("    uvd_enc_supported = %u\n", info->uvd_enc_supported);
-       printf("    uvd_fw_version = %u\n", info->uvd_fw_version);
-       printf("    vce_fw_version = %u\n", info->vce_fw_version);
-       printf("    vce_harvest_config = %i\n", info->vce_harvest_config);
-
-       printf("Kernel & winsys capabilities:\n");
-       printf("    drm = %i.%i.%i\n", info->drm_major,
-              info->drm_minor, info->drm_patchlevel);
-       printf("    has_userptr = %i\n", info->has_userptr);
-       printf("    has_syncobj = %u\n", info->has_syncobj);
-       printf("    has_syncobj_wait_for_submit = %u\n", info->has_syncobj_wait_for_submit);
-       printf("    has_timeline_syncobj = %u\n", info->has_timeline_syncobj);
-       printf("    has_fence_to_handle = %u\n", info->has_fence_to_handle);
-       printf("    has_ctx_priority = %u\n", info->has_ctx_priority);
-       printf("    has_local_buffers = %u\n", info->has_local_buffers);
-       printf("    kernel_flushes_hdp_before_ib = %u\n", info->kernel_flushes_hdp_before_ib);
-       printf("    htile_cmask_support_1d_tiling = %u\n", info->htile_cmask_support_1d_tiling);
-       printf("    si_TA_CS_BC_BASE_ADDR_allowed = %u\n", info->si_TA_CS_BC_BASE_ADDR_allowed);
-       printf("    has_bo_metadata = %u\n", info->has_bo_metadata);
-       printf("    has_gpu_reset_status_query = %u\n", info->has_gpu_reset_status_query);
-       printf("    has_eqaa_surface_allocator = %u\n", info->has_eqaa_surface_allocator);
-       printf("    has_format_bc1_through_bc7 = %u\n", info->has_format_bc1_through_bc7);
-       printf("    kernel_flushes_tc_l2_after_ib = %u\n", info->kernel_flushes_tc_l2_after_ib);
-       printf("    has_indirect_compute_dispatch = %u\n", info->has_indirect_compute_dispatch);
-       printf("    has_unaligned_shader_loads = %u\n", info->has_unaligned_shader_loads);
-       printf("    has_sparse_vm_mappings = %u\n", info->has_sparse_vm_mappings);
-       printf("    has_2d_tiling = %u\n", info->has_2d_tiling);
-       printf("    has_read_registers_query = %u\n", info->has_read_registers_query);
-       printf("    has_gds_ordered_append = %u\n", info->has_gds_ordered_append);
-       printf("    has_scheduled_fence_dependency = %u\n", info->has_scheduled_fence_dependency);
-       printf("    mid_command_buffer_preemption_enabled = %u\n", info->mid_command_buffer_preemption_enabled);
-
-       printf("Shader core info:\n");
-       printf("    max_shader_clock = %i\n", info->max_shader_clock);
-       printf("    num_good_compute_units = %i\n", info->num_good_compute_units);
-       printf("    max_good_cu_per_sa = %i\n", info->max_good_cu_per_sa);
-       printf("    min_good_cu_per_sa = %i\n", info->min_good_cu_per_sa);
-       printf("    max_se = %i\n", info->max_se);
-       printf("    max_sh_per_se = %i\n", info->max_sh_per_se);
-       printf("    max_wave64_per_simd = %i\n", info->max_wave64_per_simd);
-       printf("    num_physical_sgprs_per_simd = %i\n", info->num_physical_sgprs_per_simd);
-       printf("    num_physical_wave64_vgprs_per_simd = %i\n", info->num_physical_wave64_vgprs_per_simd);
-       printf("    num_simd_per_compute_unit = %i\n", info->num_simd_per_compute_unit);
-       printf("    min_sgpr_alloc = %i\n", info->min_sgpr_alloc);
-       printf("    max_sgpr_alloc = %i\n", info->max_sgpr_alloc);
-       printf("    sgpr_alloc_granularity = %i\n", info->sgpr_alloc_granularity);
-       printf("    min_wave64_vgpr_alloc = %i\n", info->min_wave64_vgpr_alloc);
-       printf("    max_vgpr_alloc = %i\n", info->max_vgpr_alloc);
-       printf("    wave64_vgpr_alloc_granularity = %i\n", info->wave64_vgpr_alloc_granularity);
-
-       printf("Render backend info:\n");
-       printf("    pa_sc_tile_steering_override = 0x%x\n", info->pa_sc_tile_steering_override);
-       printf("    num_render_backends = %i\n", info->num_render_backends);
-       printf("    num_tile_pipes = %i\n", info->num_tile_pipes);
-       printf("    pipe_interleave_bytes = %i\n", info->pipe_interleave_bytes);
-       printf("    enabled_rb_mask = 0x%x\n", info->enabled_rb_mask);
-       printf("    max_alignment = %u\n", (unsigned)info->max_alignment);
-       printf("    pbb_max_alloc_count = %u\n", info->pbb_max_alloc_count);
-
-       printf("GB_ADDR_CONFIG: 0x%08x\n", info->gb_addr_config);
-       if (info->chip_class >= GFX10) {
-               printf("    num_pipes = %u\n",
-                      1 << G_0098F8_NUM_PIPES(info->gb_addr_config));
-               printf("    pipe_interleave_size = %u\n",
-                      256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config));
-               printf("    max_compressed_frags = %u\n",
-                      1 << G_0098F8_MAX_COMPRESSED_FRAGS(info->gb_addr_config));
-       } else if (info->chip_class == GFX9) {
-               printf("    num_pipes = %u\n",
-                      1 << G_0098F8_NUM_PIPES(info->gb_addr_config));
-               printf("    pipe_interleave_size = %u\n",
-                      256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config));
-               printf("    max_compressed_frags = %u\n",
-                      1 << G_0098F8_MAX_COMPRESSED_FRAGS(info->gb_addr_config));
-               printf("    bank_interleave_size = %u\n",
-                      1 << G_0098F8_BANK_INTERLEAVE_SIZE(info->gb_addr_config));
-               printf("    num_banks = %u\n",
-                      1 << G_0098F8_NUM_BANKS(info->gb_addr_config));
-               printf("    shader_engine_tile_size = %u\n",
-                      16 << G_0098F8_SHADER_ENGINE_TILE_SIZE(info->gb_addr_config));
-               printf("    num_shader_engines = %u\n",
-                      1 << G_0098F8_NUM_SHADER_ENGINES_GFX9(info->gb_addr_config));
-               printf("    num_gpus = %u (raw)\n",
-                      G_0098F8_NUM_GPUS_GFX9(info->gb_addr_config));
-               printf("    multi_gpu_tile_size = %u (raw)\n",
-                      G_0098F8_MULTI_GPU_TILE_SIZE(info->gb_addr_config));
-               printf("    num_rb_per_se = %u\n",
-                      1 << G_0098F8_NUM_RB_PER_SE(info->gb_addr_config));
-               printf("    row_size = %u\n",
-                      1024 << G_0098F8_ROW_SIZE(info->gb_addr_config));
-               printf("    num_lower_pipes = %u (raw)\n",
-                      G_0098F8_NUM_LOWER_PIPES(info->gb_addr_config));
-               printf("    se_enable = %u (raw)\n",
-                      G_0098F8_SE_ENABLE(info->gb_addr_config));
-       } else {
-               printf("    num_pipes = %u\n",
-                      1 << G_0098F8_NUM_PIPES(info->gb_addr_config));
-               printf("    pipe_interleave_size = %u\n",
-                      256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(info->gb_addr_config));
-               printf("    bank_interleave_size = %u\n",
-                      1 << G_0098F8_BANK_INTERLEAVE_SIZE(info->gb_addr_config));
-               printf("    num_shader_engines = %u\n",
-                      1 << G_0098F8_NUM_SHADER_ENGINES_GFX6(info->gb_addr_config));
-               printf("    shader_engine_tile_size = %u\n",
-                      16 << G_0098F8_SHADER_ENGINE_TILE_SIZE(info->gb_addr_config));
-               printf("    num_gpus = %u (raw)\n",
-                      G_0098F8_NUM_GPUS_GFX6(info->gb_addr_config));
-               printf("    multi_gpu_tile_size = %u (raw)\n",
-                      G_0098F8_MULTI_GPU_TILE_SIZE(info->gb_addr_config));
-               printf("    row_size = %u\n",
-                      1024 << G_0098F8_ROW_SIZE(info->gb_addr_config));
-               printf("    num_lower_pipes = %u (raw)\n",
-                      G_0098F8_NUM_LOWER_PIPES(info->gb_addr_config));
-       }
+   printf("Device info:\n");
+   printf("    pci (domain:bus:dev.func): %04x:%02x:%02x.%x\n", info->pci_domain, info->pci_bus,
+          info->pci_dev, info->pci_func);
+
+   printf("    name = %s\n", info->name);
+   printf("    marketing_name = %s\n", info->marketing_name);
+   printf("    is_pro_graphics = %u\n", info->is_pro_graphics);
+   printf("    pci_id = 0x%x\n", info->pci_id);
+   printf("    pci_rev_id = 0x%x\n", info->pci_rev_id);
+   printf("    family = %i\n", info->family);
+   printf("    chip_class = %i\n", info->chip_class);
+   printf("    family_id = %i\n", info->family_id);
+   printf("    chip_external_rev = %i\n", info->chip_external_rev);
+   printf("    clock_crystal_freq = %i\n", info->clock_crystal_freq);
+
+   printf("Features:\n");
+   printf("    has_graphics = %i\n", info->has_graphics);
+   printf("    num_rings[RING_GFX] = %i\n", info->num_rings[RING_GFX]);
+   printf("    num_rings[RING_DMA] = %i\n", info->num_rings[RING_DMA]);
+   printf("    num_rings[RING_COMPUTE] = %u\n", info->num_rings[RING_COMPUTE]);
+   printf("    num_rings[RING_UVD] = %i\n", info->num_rings[RING_UVD]);
+   printf("    num_rings[RING_VCE] = %i\n", info->num_rings[RING_VCE]);
+   printf("    num_rings[RING_UVD_ENC] = %i\n", info->num_rings[RING_UVD_ENC]);
+   printf("    num_rings[RING_VCN_DEC] = %i\n", info->num_rings[RING_VCN_DEC]);
+   printf("    num_rings[RING_VCN_ENC] = %i\n", info->num_rings[RING_VCN_ENC]);
+   printf("    num_rings[RING_VCN_JPEG] = %i\n", info->num_rings[RING_VCN_JPEG]);
+   printf("    has_clear_state = %u\n", info->has_clear_state);
+   printf("    has_distributed_tess = %u\n", info->has_distributed_tess);
+   printf("    has_dcc_constant_encode = %u\n", info->has_dcc_constant_encode);
+   printf("    has_rbplus = %u\n", info->has_rbplus);
+   printf("    rbplus_allowed = %u\n", info->rbplus_allowed);
+   printf("    has_load_ctx_reg_pkt = %u\n", info->has_load_ctx_reg_pkt);
+   printf("    has_out_of_order_rast = %u\n", info->has_out_of_order_rast);
+   printf("    cpdma_prefetch_writes_memory = %u\n", info->cpdma_prefetch_writes_memory);
+   printf("    has_gfx9_scissor_bug = %i\n", info->has_gfx9_scissor_bug);
+   printf("    has_tc_compat_zrange_bug = %i\n", info->has_tc_compat_zrange_bug);
+   printf("    has_msaa_sample_loc_bug = %i\n", info->has_msaa_sample_loc_bug);
+   printf("    has_ls_vgpr_init_bug = %i\n", info->has_ls_vgpr_init_bug);
+
+   printf("Display features:\n");
+   printf("    use_display_dcc_unaligned = %u\n", info->use_display_dcc_unaligned);
+   printf("    use_display_dcc_with_retile_blit = %u\n", info->use_display_dcc_with_retile_blit);
+
+   printf("Memory info:\n");
+   printf("    pte_fragment_size = %u\n", info->pte_fragment_size);
+   printf("    gart_page_size = %u\n", info->gart_page_size);
+   printf("    gart_size = %i MB\n", (int)DIV_ROUND_UP(info->gart_size, 1024 * 1024));
+   printf("    vram_size = %i MB\n", (int)DIV_ROUND_UP(info->vram_size, 1024 * 1024));
+   printf("    vram_vis_size = %i MB\n", (int)DIV_ROUND_UP(info->vram_vis_size, 1024 * 1024));
+   printf("    vram_type = %i\n", info->vram_type);
+   printf("    vram_bit_width = %i\n", info->vram_bit_width);
+   printf("    gds_size = %u kB\n", info->gds_size / 1024);
+   printf("    gds_gfx_partition_size = %u kB\n", info->gds_gfx_partition_size / 1024);
+   printf("    max_alloc_size = %i MB\n", (int)DIV_ROUND_UP(info->max_alloc_size, 1024 * 1024));
+   printf("    min_alloc_size = %u\n", info->min_alloc_size);
+   printf("    address32_hi = %u\n", info->address32_hi);
+   printf("    has_dedicated_vram = %u\n", info->has_dedicated_vram);
+   printf("    num_sdp_interfaces = %u\n", info->num_sdp_interfaces);
+   printf("    num_tcc_blocks = %i\n", info->num_tcc_blocks);
+   printf("    tcc_cache_line_size = %u\n", info->tcc_cache_line_size);
+   printf("    tcc_harvested = %u\n", info->tcc_harvested);
+   printf("    pc_lines = %u\n", info->pc_lines);
+   printf("    lds_size_per_workgroup = %u\n", info->lds_size_per_workgroup);
+   printf("    lds_granularity = %i\n", info->lds_granularity);
+   printf("    max_memory_clock = %i\n", info->max_memory_clock);
+   printf("    ce_ram_size = %i\n", info->ce_ram_size);
+   printf("    l1_cache_size = %i\n", info->l1_cache_size);
+   printf("    l2_cache_size = %i\n", info->l2_cache_size);
+
+   printf("CP info:\n");
+   printf("    gfx_ib_pad_with_type2 = %i\n", info->gfx_ib_pad_with_type2);
+   printf("    ib_alignment = %u\n", info->ib_alignment);
+   printf("    me_fw_version = %i\n", info->me_fw_version);
+   printf("    me_fw_feature = %i\n", info->me_fw_feature);
+   printf("    pfp_fw_version = %i\n", info->pfp_fw_version);
+   printf("    pfp_fw_feature = %i\n", info->pfp_fw_feature);
+   printf("    ce_fw_version = %i\n", info->ce_fw_version);
+   printf("    ce_fw_feature = %i\n", info->ce_fw_feature);
+
+   printf("Multimedia info:\n");
+   printf("    has_hw_decode = %u\n", info->has_hw_decode);
+   printf("    uvd_enc_supported = %u\n", info->uvd_enc_supported);
+   printf("    uvd_fw_version = %u\n", info->uvd_fw_version);
+   printf("    vce_fw_version = %u\n", info->vce_fw_version);
+   printf("    vce_harvest_config = %i\n", info->vce_harvest_config);
+
+   printf("Kernel & winsys capabilities:\n");
+   printf("    drm = %i.%i.%i\n", info->drm_major, info->drm_minor, info->drm_patchlevel);
+   printf("    has_userptr = %i\n", info->has_userptr);
+   printf("    has_syncobj = %u\n", info->has_syncobj);
+   printf("    has_syncobj_wait_for_submit = %u\n", info->has_syncobj_wait_for_submit);
+   printf("    has_timeline_syncobj = %u\n", info->has_timeline_syncobj);
+   printf("    has_fence_to_handle = %u\n", info->has_fence_to_handle);
+   printf("    has_ctx_priority = %u\n", info->has_ctx_priority);
+   printf("    has_local_buffers = %u\n", info->has_local_buffers);
+   printf("    kernel_flushes_hdp_before_ib = %u\n", info->kernel_flushes_hdp_before_ib);
+   printf("    htile_cmask_support_1d_tiling = %u\n", info->htile_cmask_support_1d_tiling);
+   printf("    si_TA_CS_BC_BASE_ADDR_allowed = %u\n", info->si_TA_CS_BC_BASE_ADDR_allowed);
+   printf("    has_bo_metadata = %u\n", info->has_bo_metadata);
+   printf("    has_gpu_reset_status_query = %u\n", info->has_gpu_reset_status_query);
+   printf("    has_eqaa_surface_allocator = %u\n", info->has_eqaa_surface_allocator);
+   printf("    has_format_bc1_through_bc7 = %u\n", info->has_format_bc1_through_bc7);
+   printf("    kernel_flushes_tc_l2_after_ib = %u\n", info->kernel_flushes_tc_l2_after_ib);
+   printf("    has_indirect_compute_dispatch = %u\n", info->has_indirect_compute_dispatch);
+   printf("    has_unaligned_shader_loads = %u\n", info->has_unaligned_shader_loads);
+   printf("    has_sparse_vm_mappings = %u\n", info->has_sparse_vm_mappings);
+   printf("    has_2d_tiling = %u\n", info->has_2d_tiling);
+   printf("    has_read_registers_query = %u\n", info->has_read_registers_query);
+   printf("    has_gds_ordered_append = %u\n", info->has_gds_ordered_append);
+   printf("    has_scheduled_fence_dependency = %u\n", info->has_scheduled_fence_dependency);
+   printf("    mid_command_buffer_preemption_enabled = %u\n",
+          info->mid_command_buffer_preemption_enabled);
+
+   printf("Shader core info:\n");
+   printf("    max_shader_clock = %i\n", info->max_shader_clock);
+   printf("    num_good_compute_units = %i\n", info->num_good_compute_units);
+   printf("    max_good_cu_per_sa = %i\n", info->max_good_cu_per_sa);
+   printf("    min_good_cu_per_sa = %i\n", info->min_good_cu_per_sa);
+   printf("    max_se = %i\n", info->max_se);
+   printf("    max_sh_per_se = %i\n", info->max_sh_per_se);
+   printf("    max_wave64_per_simd = %i\n", info->max_wave64_per_simd);
+   printf("    num_physical_sgprs_per_simd = %i\n", info->num_physical_sgprs_per_simd);
+   printf("    num_physical_wave64_vgprs_per_simd = %i\n",
+          info->num_physical_wave64_vgprs_per_simd);
+   printf("    num_simd_per_compute_unit = %i\n", info->num_simd_per_compute_unit);
+   printf("    min_sgpr_alloc = %i\n", info->min_sgpr_alloc);
+   printf("    max_sgpr_alloc = %i\n", info->max_sgpr_alloc);
+   printf("    sgpr_alloc_granularity = %i\n", info->sgpr_alloc_granularity);
+   printf("    min_wave64_vgpr_alloc = %i\n", info->min_wave64_vgpr_alloc);
+   printf("    max_vgpr_alloc = %i\n", info->max_vgpr_alloc);
+   printf("    wave64_vgpr_alloc_granularity = %i\n", info->wave64_vgpr_alloc_granularity);
+
+   printf("Render backend info:\n");
+   printf("    pa_sc_tile_steering_override = 0x%x\n", info->pa_sc_tile_steering_override);
+   printf("    num_render_backends = %i\n", info->num_render_backends);
+   printf("    num_tile_pipes = %i\n", info->num_tile_pipes);
+   printf("    pipe_interleave_bytes = %i\n", info->pipe_interleave_bytes);
+   printf("    enabled_rb_mask = 0x%x\n", info->enabled_rb_mask);
+   printf("    max_alignment = %u\n", (unsigned)info->max_alignment);
+   printf("    pbb_max_alloc_count = %u\n", info->pbb_max_alloc_count);
+
+   printf("GB_ADDR_CONFIG: 0x%08x\n", info->gb_addr_config);
+   if (info->chip_class >= GFX10) {
+      printf("    num_pipes = %u\n", 1 << G_0098F8_NUM_PIPES(info->gb_addr_config));
+      printf("    pipe_interleave_size = %u\n",
+             256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config));
+      printf("    max_compressed_frags = %u\n",
+             1 << G_0098F8_MAX_COMPRESSED_FRAGS(info->gb_addr_config));
+   } else if (info->chip_class == GFX9) {
+      printf("    num_pipes = %u\n", 1 << G_0098F8_NUM_PIPES(info->gb_addr_config));
+      printf("    pipe_interleave_size = %u\n",
+             256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX9(info->gb_addr_config));
+      printf("    max_compressed_frags = %u\n",
+             1 << G_0098F8_MAX_COMPRESSED_FRAGS(info->gb_addr_config));
+      printf("    bank_interleave_size = %u\n",
+             1 << G_0098F8_BANK_INTERLEAVE_SIZE(info->gb_addr_config));
+      printf("    num_banks = %u\n", 1 << G_0098F8_NUM_BANKS(info->gb_addr_config));
+      printf("    shader_engine_tile_size = %u\n",
+             16 << G_0098F8_SHADER_ENGINE_TILE_SIZE(info->gb_addr_config));
+      printf("    num_shader_engines = %u\n",
+             1 << G_0098F8_NUM_SHADER_ENGINES_GFX9(info->gb_addr_config));
+      printf("    num_gpus = %u (raw)\n", G_0098F8_NUM_GPUS_GFX9(info->gb_addr_config));
+      printf("    multi_gpu_tile_size = %u (raw)\n",
+             G_0098F8_MULTI_GPU_TILE_SIZE(info->gb_addr_config));
+      printf("    num_rb_per_se = %u\n", 1 << G_0098F8_NUM_RB_PER_SE(info->gb_addr_config));
+      printf("    row_size = %u\n", 1024 << G_0098F8_ROW_SIZE(info->gb_addr_config));
+      printf("    num_lower_pipes = %u (raw)\n", G_0098F8_NUM_LOWER_PIPES(info->gb_addr_config));
+      printf("    se_enable = %u (raw)\n", G_0098F8_SE_ENABLE(info->gb_addr_config));
+   } else {
+      printf("    num_pipes = %u\n", 1 << G_0098F8_NUM_PIPES(info->gb_addr_config));
+      printf("    pipe_interleave_size = %u\n",
+             256 << G_0098F8_PIPE_INTERLEAVE_SIZE_GFX6(info->gb_addr_config));
+      printf("    bank_interleave_size = %u\n",
+             1 << G_0098F8_BANK_INTERLEAVE_SIZE(info->gb_addr_config));
+      printf("    num_shader_engines = %u\n",
+             1 << G_0098F8_NUM_SHADER_ENGINES_GFX6(info->gb_addr_config));
+      printf("    shader_engine_tile_size = %u\n",
+             16 << G_0098F8_SHADER_ENGINE_TILE_SIZE(info->gb_addr_config));
+      printf("    num_gpus = %u (raw)\n", G_0098F8_NUM_GPUS_GFX6(info->gb_addr_config));
+      printf("    multi_gpu_tile_size = %u (raw)\n",
+             G_0098F8_MULTI_GPU_TILE_SIZE(info->gb_addr_config));
+      printf("    row_size = %u\n", 1024 << G_0098F8_ROW_SIZE(info->gb_addr_config));
+      printf("    num_lower_pipes = %u (raw)\n", G_0098F8_NUM_LOWER_PIPES(info->gb_addr_config));
+   }
 }
 
-int
-ac_get_gs_table_depth(enum chip_class chip_class, enum radeon_family family)
+int ac_get_gs_table_depth(enum chip_class chip_class, enum radeon_family family)
 {
-       if (chip_class >= GFX9)
-               return -1;
-
-       switch (family) {
-       case CHIP_OLAND:
-       case CHIP_HAINAN:
-       case CHIP_KAVERI:
-       case CHIP_KABINI:
-       case CHIP_ICELAND:
-       case CHIP_CARRIZO:
-       case CHIP_STONEY:
-               return 16;
-       case CHIP_TAHITI:
-       case CHIP_PITCAIRN:
-       case CHIP_VERDE:
-       case CHIP_BONAIRE:
-       case CHIP_HAWAII:
-       case CHIP_TONGA:
-       case CHIP_FIJI:
-       case CHIP_POLARIS10:
-       case CHIP_POLARIS11:
-       case CHIP_POLARIS12:
-       case CHIP_VEGAM:
-               return 32;
-       default:
-               unreachable("Unknown GPU");
-       }
+   if (chip_class >= GFX9)
+      return -1;
+
+   switch (family) {
+   case CHIP_OLAND:
+   case CHIP_HAINAN:
+   case CHIP_KAVERI:
+   case CHIP_KABINI:
+   case CHIP_ICELAND:
+   case CHIP_CARRIZO:
+   case CHIP_STONEY:
+      return 16;
+   case CHIP_TAHITI:
+   case CHIP_PITCAIRN:
+   case CHIP_VERDE:
+   case CHIP_BONAIRE:
+   case CHIP_HAWAII:
+   case CHIP_TONGA:
+   case CHIP_FIJI:
+   case CHIP_POLARIS10:
+   case CHIP_POLARIS11:
+   case CHIP_POLARIS12:
+   case CHIP_VEGAM:
+      return 32;
+   default:
+      unreachable("Unknown GPU");
+   }
 }
 
-void
-ac_get_raster_config(struct radeon_info *info,
-                    uint32_t *raster_config_p,
-                    uint32_t *raster_config_1_p,
-                    uint32_t *se_tile_repeat_p)
+void ac_get_raster_config(struct radeon_info *info, uint32_t *raster_config_p,
+                          uint32_t *raster_config_1_p, uint32_t *se_tile_repeat_p)
 {
-       unsigned raster_config, raster_config_1, se_tile_repeat;
-
-       switch (info->family) {
-       /* 1 SE / 1 RB */
-       case CHIP_HAINAN:
-       case CHIP_KABINI:
-       case CHIP_STONEY:
-               raster_config = 0x00000000;
-               raster_config_1 = 0x00000000;
-               break;
-       /* 1 SE / 4 RBs */
-       case CHIP_VERDE:
-               raster_config = 0x0000124a;
-               raster_config_1 = 0x00000000;
-               break;
-       /* 1 SE / 2 RBs (Oland is special) */
-       case CHIP_OLAND:
-               raster_config = 0x00000082;
-               raster_config_1 = 0x00000000;
-               break;
-       /* 1 SE / 2 RBs */
-       case CHIP_KAVERI:
-       case CHIP_ICELAND:
-       case CHIP_CARRIZO:
-               raster_config = 0x00000002;
-               raster_config_1 = 0x00000000;
-               break;
-       /* 2 SEs / 4 RBs */
-       case CHIP_BONAIRE:
-       case CHIP_POLARIS11:
-       case CHIP_POLARIS12:
-               raster_config = 0x16000012;
-               raster_config_1 = 0x00000000;
-               break;
-       /* 2 SEs / 8 RBs */
-       case CHIP_TAHITI:
-       case CHIP_PITCAIRN:
-               raster_config = 0x2a00126a;
-               raster_config_1 = 0x00000000;
-               break;
-       /* 4 SEs / 8 RBs */
-       case CHIP_TONGA:
-       case CHIP_POLARIS10:
-               raster_config = 0x16000012;
-               raster_config_1 = 0x0000002a;
-               break;
-       /* 4 SEs / 16 RBs */
-       case CHIP_HAWAII:
-       case CHIP_FIJI:
-       case CHIP_VEGAM:
-               raster_config = 0x3a00161a;
-               raster_config_1 = 0x0000002e;
-               break;
-       default:
-               fprintf(stderr,
-                       "ac: Unknown GPU, using 0 for raster_config\n");
-               raster_config = 0x00000000;
-               raster_config_1 = 0x00000000;
-               break;
-       }
-
-       /* drm/radeon on Kaveri is buggy, so disable 1 RB to work around it.
-        * This decreases performance by up to 50% when the RB is the bottleneck.
-        */
-       if (info->family == CHIP_KAVERI && !info->is_amdgpu)
-               raster_config = 0x00000000;
-
-       /* Fiji: Old kernels have incorrect tiling config. This decreases
-        * RB performance by 25%. (it disables 1 RB in the second packer)
-        */
-       if (info->family == CHIP_FIJI &&
-           info->cik_macrotile_mode_array[0] == 0x000000e8) {
-               raster_config = 0x16000012;
-               raster_config_1 = 0x0000002a;
-       }
-
-       unsigned se_width = 8 << G_028350_SE_XSEL_GFX6(raster_config);
-       unsigned se_height = 8 << G_028350_SE_YSEL_GFX6(raster_config);
-
-       /* I don't know how to calculate this, though this is probably a good guess. */
-       se_tile_repeat = MAX2(se_width, se_height) * info->max_se;
-
-       *raster_config_p = raster_config;
-       *raster_config_1_p = raster_config_1;
-       if (se_tile_repeat_p)
-               *se_tile_repeat_p = se_tile_repeat;
+   unsigned raster_config, raster_config_1, se_tile_repeat;
+
+   switch (info->family) {
+   /* 1 SE / 1 RB */
+   case CHIP_HAINAN:
+   case CHIP_KABINI:
+   case CHIP_STONEY:
+      raster_config = 0x00000000;
+      raster_config_1 = 0x00000000;
+      break;
+   /* 1 SE / 4 RBs */
+   case CHIP_VERDE:
+      raster_config = 0x0000124a;
+      raster_config_1 = 0x00000000;
+      break;
+   /* 1 SE / 2 RBs (Oland is special) */
+   case CHIP_OLAND:
+      raster_config = 0x00000082;
+      raster_config_1 = 0x00000000;
+      break;
+   /* 1 SE / 2 RBs */
+   case CHIP_KAVERI:
+   case CHIP_ICELAND:
+   case CHIP_CARRIZO:
+      raster_config = 0x00000002;
+      raster_config_1 = 0x00000000;
+      break;
+   /* 2 SEs / 4 RBs */
+   case CHIP_BONAIRE:
+   case CHIP_POLARIS11:
+   case CHIP_POLARIS12:
+      raster_config = 0x16000012;
+      raster_config_1 = 0x00000000;
+      break;
+   /* 2 SEs / 8 RBs */
+   case CHIP_TAHITI:
+   case CHIP_PITCAIRN:
+      raster_config = 0x2a00126a;
+      raster_config_1 = 0x00000000;
+      break;
+   /* 4 SEs / 8 RBs */
+   case CHIP_TONGA:
+   case CHIP_POLARIS10:
+      raster_config = 0x16000012;
+      raster_config_1 = 0x0000002a;
+      break;
+   /* 4 SEs / 16 RBs */
+   case CHIP_HAWAII:
+   case CHIP_FIJI:
+   case CHIP_VEGAM:
+      raster_config = 0x3a00161a;
+      raster_config_1 = 0x0000002e;
+      break;
+   default:
+      fprintf(stderr, "ac: Unknown GPU, using 0 for raster_config\n");
+      raster_config = 0x00000000;
+      raster_config_1 = 0x00000000;
+      break;
+   }
+
+   /* drm/radeon on Kaveri is buggy, so disable 1 RB to work around it.
+    * This decreases performance by up to 50% when the RB is the bottleneck.
+    */
+   if (info->family == CHIP_KAVERI && !info->is_amdgpu)
+      raster_config = 0x00000000;
+
+   /* Fiji: Old kernels have an incorrect tiling config, which decreases RB
+    * performance by 25% because it disables 1 RB in the second packer.
+    */
+   if (info->family == CHIP_FIJI && info->cik_macrotile_mode_array[0] == 0x000000e8) {
+      raster_config = 0x16000012;
+      raster_config_1 = 0x0000002a;
+   }
+
+   unsigned se_width = 8 << G_028350_SE_XSEL_GFX6(raster_config);
+   unsigned se_height = 8 << G_028350_SE_YSEL_GFX6(raster_config);
+
+   /* I don't know how to calculate this, though this is probably a good guess. */
+   se_tile_repeat = MAX2(se_width, se_height) * info->max_se;
+
+   *raster_config_p = raster_config;
+   *raster_config_1_p = raster_config_1;
+   if (se_tile_repeat_p)
+      *se_tile_repeat_p = se_tile_repeat;
 }
 
-void
-ac_get_harvested_configs(struct radeon_info *info,
-                        unsigned raster_config,
-                        unsigned *cik_raster_config_1_p,
-                        unsigned *raster_config_se)
+void ac_get_harvested_configs(struct radeon_info *info, unsigned raster_config,
+                              unsigned *cik_raster_config_1_p, unsigned *raster_config_se)
 {
-       unsigned sh_per_se = MAX2(info->max_sh_per_se, 1);
-       unsigned num_se = MAX2(info->max_se, 1);
-       unsigned rb_mask = info->enabled_rb_mask;
-       unsigned num_rb = MIN2(info->num_render_backends, 16);
-       unsigned rb_per_pkr = MIN2(num_rb / num_se / sh_per_se, 2);
-       unsigned rb_per_se = num_rb / num_se;
-       unsigned se_mask[4];
-       unsigned se;
-
-       se_mask[0] = ((1 << rb_per_se) - 1) & rb_mask;
-       se_mask[1] = (se_mask[0] << rb_per_se) & rb_mask;
-       se_mask[2] = (se_mask[1] << rb_per_se) & rb_mask;
-       se_mask[3] = (se_mask[2] << rb_per_se) & rb_mask;
-
-       assert(num_se == 1 || num_se == 2 || num_se == 4);
-       assert(sh_per_se == 1 || sh_per_se == 2);
-       assert(rb_per_pkr == 1 || rb_per_pkr == 2);
-
-
-       if (info->chip_class >= GFX7) {
-               unsigned raster_config_1 = *cik_raster_config_1_p;
-               if ((num_se > 2) && ((!se_mask[0] && !se_mask[1]) ||
-                                    (!se_mask[2] && !se_mask[3]))) {
-                       raster_config_1 &= C_028354_SE_PAIR_MAP;
-
-                       if (!se_mask[0] && !se_mask[1]) {
-                               raster_config_1 |=
-                                       S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_3);
-                       } else {
-                               raster_config_1 |=
-                                       S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_0);
-                       }
-                       *cik_raster_config_1_p = raster_config_1;
-               }
-       }
-
-       for (se = 0; se < num_se; se++) {
-               unsigned pkr0_mask = ((1 << rb_per_pkr) - 1) << (se * rb_per_se);
-               unsigned pkr1_mask = pkr0_mask << rb_per_pkr;
-               int idx = (se / 2) * 2;
-
-               raster_config_se[se] = raster_config;
-               if ((num_se > 1) && (!se_mask[idx] || !se_mask[idx + 1])) {
-                       raster_config_se[se] &= C_028350_SE_MAP;
-
-                       if (!se_mask[idx]) {
-                               raster_config_se[se] |=
-                                       S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3);
-                       } else {
-                               raster_config_se[se] |=
-                                       S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0);
-                       }
-               }
-
-               pkr0_mask &= rb_mask;
-               pkr1_mask &= rb_mask;
-               if (rb_per_se > 2 && (!pkr0_mask || !pkr1_mask)) {
-                       raster_config_se[se] &= C_028350_PKR_MAP;
-
-                       if (!pkr0_mask) {
-                               raster_config_se[se] |=
-                                       S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_3);
-                       } else {
-                               raster_config_se[se] |=
-                                       S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_0);
-                       }
-               }
-
-               if (rb_per_se >= 2) {
-                       unsigned rb0_mask = 1 << (se * rb_per_se);
-                       unsigned rb1_mask = rb0_mask << 1;
-
-                       rb0_mask &= rb_mask;
-                       rb1_mask &= rb_mask;
-                       if (!rb0_mask || !rb1_mask) {
-                               raster_config_se[se] &= C_028350_RB_MAP_PKR0;
-
-                               if (!rb0_mask) {
-                                       raster_config_se[se] |=
-                                               S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_3);
-                               } else {
-                                       raster_config_se[se] |=
-                                               S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_0);
-                               }
-                       }
-
-                       if (rb_per_se > 2) {
-                               rb0_mask = 1 << (se * rb_per_se + rb_per_pkr);
-                               rb1_mask = rb0_mask << 1;
-                               rb0_mask &= rb_mask;
-                               rb1_mask &= rb_mask;
-                               if (!rb0_mask || !rb1_mask) {
-                                       raster_config_se[se] &= C_028350_RB_MAP_PKR1;
-
-                                       if (!rb0_mask) {
-                                               raster_config_se[se] |=
-                                                       S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_3);
-                                       } else {
-                                               raster_config_se[se] |=
-                                                       S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_0);
-                                       }
-                               }
-                       }
-               }
-       }
+   unsigned sh_per_se = MAX2(info->max_sh_per_se, 1);
+   unsigned num_se = MAX2(info->max_se, 1);
+   unsigned rb_mask = info->enabled_rb_mask;
+   unsigned num_rb = MIN2(info->num_render_backends, 16);
+   unsigned rb_per_pkr = MIN2(num_rb / num_se / sh_per_se, 2);
+   unsigned rb_per_se = num_rb / num_se;
+   unsigned se_mask[4];
+   unsigned se;
+
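+   /* se_mask[i] holds the enabled-RB bits that belong to shader engine i. */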
+   se_mask[0] = ((1 << rb_per_se) - 1) & rb_mask;
+   se_mask[1] = (se_mask[0] << rb_per_se) & rb_mask;
+   se_mask[2] = (se_mask[1] << rb_per_se) & rb_mask;
+   se_mask[3] = (se_mask[2] << rb_per_se) & rb_mask;
+
+   assert(num_se == 1 || num_se == 2 || num_se == 4);
+   assert(sh_per_se == 1 || sh_per_se == 2);
+   assert(rb_per_pkr == 1 || rb_per_pkr == 2);
+
+   if (info->chip_class >= GFX7) {
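+      /* If an entire SE pair has all of its RBs harvested, remap SE_PAIR_MAP away from it. */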
+      unsigned raster_config_1 = *cik_raster_config_1_p;
+      if ((num_se > 2) && ((!se_mask[0] && !se_mask[1]) || (!se_mask[2] && !se_mask[3]))) {
+         raster_config_1 &= C_028354_SE_PAIR_MAP;
+
+         if (!se_mask[0] && !se_mask[1]) {
+            raster_config_1 |= S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_3);
+         } else {
+            raster_config_1 |= S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_0);
+         }
+         *cik_raster_config_1_p = raster_config_1;
+      }
+   }
+
+   for (se = 0; se < num_se; se++) {
+      unsigned pkr0_mask = ((1 << rb_per_pkr) - 1) << (se * rb_per_se);
+      unsigned pkr1_mask = pkr0_mask << rb_per_pkr;
+      int idx = (se / 2) * 2;
+
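+      /* Start from the global raster_config and override the SE/PKR/RB maps where blocks are harvested. */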
+      raster_config_se[se] = raster_config;
+      if ((num_se > 1) && (!se_mask[idx] || !se_mask[idx + 1])) {
+         raster_config_se[se] &= C_028350_SE_MAP;
+
+         if (!se_mask[idx]) {
+            raster_config_se[se] |= S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3);
+         } else {
+            raster_config_se[se] |= S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0);
+         }
+      }
+
+      pkr0_mask &= rb_mask;
+      pkr1_mask &= rb_mask;
+      if (rb_per_se > 2 && (!pkr0_mask || !pkr1_mask)) {
+         raster_config_se[se] &= C_028350_PKR_MAP;
+
+         if (!pkr0_mask) {
+            raster_config_se[se] |= S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_3);
+         } else {
+            raster_config_se[se] |= S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_0);
+         }
+      }
+
+      if (rb_per_se >= 2) {
+         unsigned rb0_mask = 1 << (se * rb_per_se);
+         unsigned rb1_mask = rb0_mask << 1;
+
+         rb0_mask &= rb_mask;
+         rb1_mask &= rb_mask;
+         if (!rb0_mask || !rb1_mask) {
+            raster_config_se[se] &= C_028350_RB_MAP_PKR0;
+
+            if (!rb0_mask) {
+               raster_config_se[se] |= S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_3);
+            } else {
+               raster_config_se[se] |= S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_0);
+            }
+         }
+
+         if (rb_per_se > 2) {
+            rb0_mask = 1 << (se * rb_per_se + rb_per_pkr);
+            rb1_mask = rb0_mask << 1;
+            rb0_mask &= rb_mask;
+            rb1_mask &= rb_mask;
+            if (!rb0_mask || !rb1_mask) {
+               raster_config_se[se] &= C_028350_RB_MAP_PKR1;
+
+               if (!rb0_mask) {
+                  raster_config_se[se] |= S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_3);
+               } else {
+                  raster_config_se[se] |= S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_0);
+               }
+            }
+         }
+      }
+   }
 }
 
-unsigned ac_get_compute_resource_limits(struct radeon_info *info,
-                                       unsigned waves_per_threadgroup,
-                                       unsigned max_waves_per_sh,
-                                       unsigned threadgroups_per_cu)
+unsigned ac_get_compute_resource_limits(struct radeon_info *info, unsigned waves_per_threadgroup,
+                                        unsigned max_waves_per_sh, unsigned threadgroups_per_cu)
 {
-       unsigned compute_resource_limits =
-               S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0);
-
-       if (info->chip_class >= GFX7) {
-               unsigned num_cu_per_se = info->num_good_compute_units /
-                                        info->max_se;
-
-               /* Force even distribution on all SIMDs in CU if the workgroup
-                * size is 64. This has shown some good improvements if # of CUs
-                * per SE is not a multiple of 4.
-                */
-               if (num_cu_per_se % 4 && waves_per_threadgroup == 1)
-                       compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1);
-
-               assert(threadgroups_per_cu >= 1 && threadgroups_per_cu <= 8);
-               compute_resource_limits |= S_00B854_WAVES_PER_SH(max_waves_per_sh) |
-                                          S_00B854_CU_GROUP_COUNT(threadgroups_per_cu - 1);
-       } else {
-               /* GFX6 */
-               if (max_waves_per_sh) {
-                       unsigned limit_div16 = DIV_ROUND_UP(max_waves_per_sh, 16);
-                       compute_resource_limits |= S_00B854_WAVES_PER_SH_GFX6(limit_div16);
-               }
-       }
-       return compute_resource_limits;
+   unsigned compute_resource_limits = S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0);
+
+   if (info->chip_class >= GFX7) {
+      unsigned num_cu_per_se = info->num_good_compute_units / info->max_se;
+
+      /* Force even distribution on all SIMDs in CU if the workgroup
+       * size is 64. This has shown some good improvements if # of CUs
+       * per SE is not a multiple of 4.
+       */
+      if (num_cu_per_se % 4 && waves_per_threadgroup == 1)
+         compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1);
+
+      assert(threadgroups_per_cu >= 1 && threadgroups_per_cu <= 8);
+      compute_resource_limits |=
+         S_00B854_WAVES_PER_SH(max_waves_per_sh) | S_00B854_CU_GROUP_COUNT(threadgroups_per_cu - 1);
+   } else {
+      /* GFX6 */
+      if (max_waves_per_sh) {
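+         /* On GFX6, WAVES_PER_SH is programmed in units of 16 waves. */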
+         unsigned limit_div16 = DIV_ROUND_UP(max_waves_per_sh, 16);
+         compute_resource_limits |= S_00B854_WAVES_PER_SH_GFX6(limit_div16);
+      }
+   }
+   return compute_resource_limits;
 }
index 70e53f16cb403cda05b1fc76fe69802a592f9c12..f6d4e621b58f5d7d6a4471da6fb4e9ec5002c9ca 100644 (file)
 #ifndef AC_GPU_INFO_H
 #define AC_GPU_INFO_H
 
+#include "amd_family.h"
+
+#include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
-#include <stdbool.h>
-#include "amd_family.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -38,186 +39,179 @@ extern "C" {
 struct amdgpu_gpu_info;
 
 struct radeon_info {
-       /* PCI info: domain:bus:dev:func */
-       uint32_t                    pci_domain;
-       uint32_t                    pci_bus;
-       uint32_t                    pci_dev;
-       uint32_t                    pci_func;
-
-       /* Device info. */
-       const char                  *name;
-       const char                  *marketing_name;
-       bool                        is_pro_graphics;
-       uint32_t                    pci_id;
-       uint32_t                    pci_rev_id;
-       enum radeon_family          family;
-       enum chip_class             chip_class;
-       uint32_t                    family_id;
-       uint32_t                    chip_external_rev;
-       uint32_t                    clock_crystal_freq;
-
-       /* Features. */
-       bool                        has_graphics; /* false if the chip is compute-only */
-       uint32_t                    num_rings[NUM_RING_TYPES];
-       uint32_t                    ib_pad_dw_mask[NUM_RING_TYPES];
-       bool                        has_clear_state;
-       bool                        has_distributed_tess;
-       bool                        has_dcc_constant_encode;
-       bool                        has_rbplus; /* if RB+ registers exist */
-       bool                        rbplus_allowed; /* if RB+ is allowed */
-       bool                        has_load_ctx_reg_pkt;
-       bool                        has_out_of_order_rast;
-       bool                        has_packed_math_16bit;
-       bool                        cpdma_prefetch_writes_memory;
-       bool                        has_gfx9_scissor_bug;
-       bool                        has_tc_compat_zrange_bug;
-       bool                        has_msaa_sample_loc_bug;
-       bool                        has_ls_vgpr_init_bug;
-
-       /* Display features. */
-       /* There are 2 display DCC codepaths, because display expects unaligned DCC. */
-       /* Disable RB and pipe alignment to skip the retile blit. (1 RB chips only) */
-       bool                        use_display_dcc_unaligned;
-       /* Allocate both aligned and unaligned DCC and use the retile blit. */
-       bool                        use_display_dcc_with_retile_blit;
-
-       /* Memory info. */
-       uint32_t                    pte_fragment_size;
-       uint32_t                    gart_page_size;
-       uint64_t                    gart_size;
-       uint64_t                    vram_size;
-       uint64_t                    vram_vis_size;
-       uint32_t                    vram_bit_width;
-       uint32_t                    vram_type;
-       unsigned                    gds_size;
-       unsigned                    gds_gfx_partition_size;
-       uint64_t                    max_alloc_size;
-       uint32_t                    min_alloc_size;
-       uint32_t                    address32_hi;
-       bool                        has_dedicated_vram;
-       bool                        has_l2_uncached;
-       bool                        r600_has_virtual_memory;
-       uint32_t                    num_sdp_interfaces;
-       uint32_t                    num_tcc_blocks;
-       uint32_t                    tcc_cache_line_size;
-       bool                        tcc_harvested;
-       unsigned                    pc_lines;
-       uint32_t                    lds_size_per_workgroup;
-       uint32_t                    lds_granularity;
-       uint32_t                    max_memory_clock;
-       uint32_t                    ce_ram_size;
-       uint32_t                    l1_cache_size;
-       uint32_t                    l2_cache_size;
-
-       /* CP info. */
-       bool                        gfx_ib_pad_with_type2;
-       unsigned                    ib_alignment; /* both start and size alignment */
-       uint32_t                    me_fw_version;
-       uint32_t                    me_fw_feature;
-       uint32_t                    pfp_fw_version;
-       uint32_t                    pfp_fw_feature;
-       uint32_t                    ce_fw_version;
-       uint32_t                    ce_fw_feature;
-
-       /* Multimedia info. */
-       bool                        has_hw_decode;
-       bool                        uvd_enc_supported;
-       uint32_t                    uvd_fw_version;
-       uint32_t                    vce_fw_version;
-       uint32_t                    vce_harvest_config;
-
-       /* Kernel & winsys capabilities. */
-       uint32_t                    drm_major; /* version */
-       uint32_t                    drm_minor;
-       uint32_t                    drm_patchlevel;
-       bool                        is_amdgpu;
-       bool                        has_userptr;
-       bool                        has_syncobj;
-       bool                        has_syncobj_wait_for_submit;
-       bool                        has_timeline_syncobj;
-       bool                        has_fence_to_handle;
-       bool                        has_ctx_priority;
-       bool                        has_local_buffers;
-       bool                        kernel_flushes_hdp_before_ib;
-       bool                        htile_cmask_support_1d_tiling;
-       bool                        si_TA_CS_BC_BASE_ADDR_allowed;
-       bool                        has_bo_metadata;
-       bool                        has_gpu_reset_status_query;
-       bool                        has_eqaa_surface_allocator;
-       bool                        has_format_bc1_through_bc7;
-       bool                        kernel_flushes_tc_l2_after_ib;
-       bool                        has_indirect_compute_dispatch;
-       bool                        has_unaligned_shader_loads;
-       bool                        has_sparse_vm_mappings;
-       bool                        has_2d_tiling;
-       bool                        has_read_registers_query;
-       bool                        has_gds_ordered_append;
-       bool                        has_scheduled_fence_dependency;
-       /* Whether SR-IOV is enabled or amdgpu.mcbp=1 was set on the kernel command line. */
-       bool                        mid_command_buffer_preemption_enabled;
-
-       /* Shader cores. */
-       uint32_t                    cu_mask[4][2];
-       uint32_t                    r600_max_quad_pipes; /* wave size / 16 */
-       uint32_t                    max_shader_clock;
-       uint32_t                    num_good_compute_units;
-       uint32_t                    max_good_cu_per_sa;
-       uint32_t                    min_good_cu_per_sa; /* min != max if SAs have different # of CUs */
-       uint32_t                    max_se; /* shader engines */
-       uint32_t                    max_sh_per_se; /* shader arrays per shader engine */
-       uint32_t                    max_wave64_per_simd;
-       uint32_t                    num_physical_sgprs_per_simd;
-       uint32_t                    num_physical_wave64_vgprs_per_simd;
-       uint32_t                    num_simd_per_compute_unit;
-       uint32_t                    min_sgpr_alloc;
-       uint32_t                    max_sgpr_alloc;
-       uint32_t                    sgpr_alloc_granularity;
-       uint32_t                    min_wave64_vgpr_alloc;
-       uint32_t                    max_vgpr_alloc;
-       uint32_t                    wave64_vgpr_alloc_granularity;
-       bool                        use_late_alloc; /* VS and GS: late pos/param allocation */
-
-       /* Render backends (color + depth blocks). */
-       uint32_t                    r300_num_gb_pipes;
-       uint32_t                    r300_num_z_pipes;
-       uint32_t                    r600_gb_backend_map; /* R600 harvest config */
-       bool                        r600_gb_backend_map_valid;
-       uint32_t                    r600_num_banks;
-       uint32_t                    gb_addr_config;
-       uint32_t                    pa_sc_tile_steering_override; /* CLEAR_STATE also sets this */
-       uint32_t                    num_render_backends;
-       uint32_t                    num_tile_pipes; /* pipe count from PIPE_CONFIG */
-       uint32_t                    pipe_interleave_bytes;
-       uint32_t                    enabled_rb_mask; /* GCN harvest config */
-       uint64_t                    max_alignment; /* from addrlib */
-       uint32_t                    pbb_max_alloc_count;
-
-       /* Tile modes. */
-       uint32_t                    si_tile_mode_array[32];
-       uint32_t                    cik_macrotile_mode_array[16];
+   /* PCI info: domain:bus:dev:func */
+   uint32_t pci_domain;
+   uint32_t pci_bus;
+   uint32_t pci_dev;
+   uint32_t pci_func;
+
+   /* Device info. */
+   const char *name;
+   const char *marketing_name;
+   bool is_pro_graphics;
+   uint32_t pci_id;
+   uint32_t pci_rev_id;
+   enum radeon_family family;
+   enum chip_class chip_class;
+   uint32_t family_id;
+   uint32_t chip_external_rev;
+   uint32_t clock_crystal_freq;
+
+   /* Features. */
+   bool has_graphics; /* false if the chip is compute-only */
+   uint32_t num_rings[NUM_RING_TYPES];
+   uint32_t ib_pad_dw_mask[NUM_RING_TYPES];
+   bool has_clear_state;
+   bool has_distributed_tess;
+   bool has_dcc_constant_encode;
+   bool has_rbplus;     /* if RB+ registers exist */
+   bool rbplus_allowed; /* if RB+ is allowed */
+   bool has_load_ctx_reg_pkt;
+   bool has_out_of_order_rast;
+   bool has_packed_math_16bit;
+   bool cpdma_prefetch_writes_memory;
+   bool has_gfx9_scissor_bug;
+   bool has_tc_compat_zrange_bug;
+   bool has_msaa_sample_loc_bug;
+   bool has_ls_vgpr_init_bug;
+
+   /* Display features. */
+   /* There are 2 display DCC codepaths, because display expects unaligned DCC. */
+   /* Disable RB and pipe alignment to skip the retile blit. (1 RB chips only) */
+   bool use_display_dcc_unaligned;
+   /* Allocate both aligned and unaligned DCC and use the retile blit. */
+   bool use_display_dcc_with_retile_blit;
+
+   /* Memory info. */
+   uint32_t pte_fragment_size;
+   uint32_t gart_page_size;
+   uint64_t gart_size;
+   uint64_t vram_size;
+   uint64_t vram_vis_size;
+   uint32_t vram_bit_width;
+   uint32_t vram_type;
+   unsigned gds_size;
+   unsigned gds_gfx_partition_size;
+   uint64_t max_alloc_size;
+   uint32_t min_alloc_size;
+   uint32_t address32_hi;
+   bool has_dedicated_vram;
+   bool has_l2_uncached;
+   bool r600_has_virtual_memory;
+   uint32_t num_sdp_interfaces;
+   uint32_t num_tcc_blocks;
+   uint32_t tcc_cache_line_size;
+   bool tcc_harvested;
+   unsigned pc_lines;
+   uint32_t lds_size_per_workgroup;
+   uint32_t lds_granularity;
+   uint32_t max_memory_clock;
+   uint32_t ce_ram_size;
+   uint32_t l1_cache_size;
+   uint32_t l2_cache_size;
+
+   /* CP info. */
+   bool gfx_ib_pad_with_type2;
+   unsigned ib_alignment; /* both start and size alignment */
+   uint32_t me_fw_version;
+   uint32_t me_fw_feature;
+   uint32_t pfp_fw_version;
+   uint32_t pfp_fw_feature;
+   uint32_t ce_fw_version;
+   uint32_t ce_fw_feature;
+
+   /* Multimedia info. */
+   bool has_hw_decode;
+   bool uvd_enc_supported;
+   uint32_t uvd_fw_version;
+   uint32_t vce_fw_version;
+   uint32_t vce_harvest_config;
+
+   /* Kernel & winsys capabilities. */
+   uint32_t drm_major; /* version */
+   uint32_t drm_minor;
+   uint32_t drm_patchlevel;
+   bool is_amdgpu;
+   bool has_userptr;
+   bool has_syncobj;
+   bool has_syncobj_wait_for_submit;
+   bool has_timeline_syncobj;
+   bool has_fence_to_handle;
+   bool has_ctx_priority;
+   bool has_local_buffers;
+   bool kernel_flushes_hdp_before_ib;
+   bool htile_cmask_support_1d_tiling;
+   bool si_TA_CS_BC_BASE_ADDR_allowed;
+   bool has_bo_metadata;
+   bool has_gpu_reset_status_query;
+   bool has_eqaa_surface_allocator;
+   bool has_format_bc1_through_bc7;
+   bool kernel_flushes_tc_l2_after_ib;
+   bool has_indirect_compute_dispatch;
+   bool has_unaligned_shader_loads;
+   bool has_sparse_vm_mappings;
+   bool has_2d_tiling;
+   bool has_read_registers_query;
+   bool has_gds_ordered_append;
+   bool has_scheduled_fence_dependency;
+   /* Whether SR-IOV is enabled or amdgpu.mcbp=1 was set on the kernel command line. */
+   bool mid_command_buffer_preemption_enabled;
+
+   /* Shader cores. */
+   uint32_t cu_mask[4][2];
+   uint32_t r600_max_quad_pipes; /* wave size / 16 */
+   uint32_t max_shader_clock;
+   uint32_t num_good_compute_units;
+   uint32_t max_good_cu_per_sa;
+   uint32_t min_good_cu_per_sa; /* min != max if SAs have different # of CUs */
+   uint32_t max_se;             /* shader engines */
+   uint32_t max_sh_per_se;      /* shader arrays per shader engine */
+   uint32_t max_wave64_per_simd;
+   uint32_t num_physical_sgprs_per_simd;
+   uint32_t num_physical_wave64_vgprs_per_simd;
+   uint32_t num_simd_per_compute_unit;
+   uint32_t min_sgpr_alloc;
+   uint32_t max_sgpr_alloc;
+   uint32_t sgpr_alloc_granularity;
+   uint32_t min_wave64_vgpr_alloc;
+   uint32_t max_vgpr_alloc;
+   uint32_t wave64_vgpr_alloc_granularity;
+   bool use_late_alloc; /* VS and GS: late pos/param allocation */
+
+   /* Render backends (color + depth blocks). */
+   uint32_t r300_num_gb_pipes;
+   uint32_t r300_num_z_pipes;
+   uint32_t r600_gb_backend_map; /* R600 harvest config */
+   bool r600_gb_backend_map_valid;
+   uint32_t r600_num_banks;
+   uint32_t gb_addr_config;
+   uint32_t pa_sc_tile_steering_override; /* CLEAR_STATE also sets this */
+   uint32_t num_render_backends;
+   uint32_t num_tile_pipes; /* pipe count from PIPE_CONFIG */
+   uint32_t pipe_interleave_bytes;
+   uint32_t enabled_rb_mask; /* GCN harvest config */
+   uint64_t max_alignment;   /* from addrlib */
+   uint32_t pbb_max_alloc_count;
+
+   /* Tile modes. */
+   uint32_t si_tile_mode_array[32];
+   uint32_t cik_macrotile_mode_array[16];
 };
 
-bool ac_query_gpu_info(int fd, void *dev_p,
-                      struct radeon_info *info,
-                      struct amdgpu_gpu_info *amdinfo);
+bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
+                       struct amdgpu_gpu_info *amdinfo);
 
 void ac_compute_driver_uuid(char *uuid, size_t size);
 
 void ac_compute_device_uuid(struct radeon_info *info, char *uuid, size_t size);
 void ac_print_gpu_info(struct radeon_info *info);
 int ac_get_gs_table_depth(enum chip_class chip_class, enum radeon_family family);
-void ac_get_raster_config(struct radeon_info *info,
-                         uint32_t *raster_config_p,
-                         uint32_t *raster_config_1_p,
-                         uint32_t *se_tile_repeat_p);
-void ac_get_harvested_configs(struct radeon_info *info,
-                             unsigned raster_config,
-                             unsigned *cik_raster_config_1_p,
-                             unsigned *raster_config_se);
-unsigned ac_get_compute_resource_limits(struct radeon_info *info,
-                                       unsigned waves_per_threadgroup,
-                                       unsigned max_waves_per_sh,
-                                       unsigned threadgroups_per_cu);
+void ac_get_raster_config(struct radeon_info *info, uint32_t *raster_config_p,
+                          uint32_t *raster_config_1_p, uint32_t *se_tile_repeat_p);
+void ac_get_harvested_configs(struct radeon_info *info, unsigned raster_config,
+                              unsigned *cik_raster_config_1_p, unsigned *raster_config_se);
+unsigned ac_get_compute_resource_limits(struct radeon_info *info, unsigned waves_per_threadgroup,
+                                        unsigned max_waves_per_sh, unsigned threadgroups_per_cu);
 
 #ifdef __cplusplus
 }
index e512b8f73275686b228297602b94c6b876667443..8a9cd7c7a6e43454eccfcb1aa879c7fdb0dfc1e4 100644 (file)
 
 #include "ac_rtld.h"
 
+#include "ac_binary.h"
+#include "ac_gpu_info.h"
+#include "util/u_dynarray.h"
+#include "util/u_math.h"
+
 #include <gelf.h>
 #include <libelf.h>
 #include <stdarg.h>
 #include <stdlib.h>
 #include <string.h>
 
-#include "ac_binary.h"
-#include "ac_gpu_info.h"
-#include "util/u_dynarray.h"
-#include "util/u_math.h"
-
 // Old distributions may not have this enum constant
 #define MY_EM_AMDGPU 224
 
 #endif
 
 #ifndef R_AMDGPU_NONE
-#define R_AMDGPU_NONE 0
-#define R_AMDGPU_ABS32_LO 1
-#define R_AMDGPU_ABS32_HI 2
-#define R_AMDGPU_ABS64 3
-#define R_AMDGPU_REL32 4
-#define R_AMDGPU_REL64 5
-#define R_AMDGPU_ABS32 6
-#define R_AMDGPU_GOTPCREL 7
+#define R_AMDGPU_NONE          0
+#define R_AMDGPU_ABS32_LO      1
+#define R_AMDGPU_ABS32_HI      2
+#define R_AMDGPU_ABS64         3
+#define R_AMDGPU_REL32         4
+#define R_AMDGPU_REL64         5
+#define R_AMDGPU_ABS32         6
+#define R_AMDGPU_GOTPCREL      7
 #define R_AMDGPU_GOTPCREL32_LO 8
 #define R_AMDGPU_GOTPCREL32_HI 9
-#define R_AMDGPU_REL32_LO 10
-#define R_AMDGPU_REL32_HI 11
-#define R_AMDGPU_RELATIVE64 13
+#define R_AMDGPU_REL32_LO      10
+#define R_AMDGPU_REL32_HI      11
+#define R_AMDGPU_RELATIVE64    13
 #endif
 
 /* For the UMR disassembler. */
-#define DEBUGGER_END_OF_CODE_MARKER    0xbf9f0000 /* invalid instruction */
-#define DEBUGGER_NUM_MARKERS           5
+#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
+#define DEBUGGER_NUM_MARKERS        5
 
 struct ac_rtld_section {
-       bool is_rx : 1;
-       bool is_pasted_text : 1;
-       uint64_t offset;
-       const char *name;
+   bool is_rx : 1;
+   bool is_pasted_text : 1;
+   uint64_t offset;
+   const char *name;
 };
 
 struct ac_rtld_part {
-       Elf *elf;
-       struct ac_rtld_section *sections;
-       unsigned num_sections;
+   Elf *elf;
+   struct ac_rtld_section *sections;
+   unsigned num_sections;
 };
 
 static void report_erroraf(const char *fmt, va_list va)
 {
-       char *msg;
-       int ret = vasprintf(&msg, fmt, va);
-       if (ret < 0)
-               msg = "(vasprintf failed)";
+   char *msg;
+   int ret = vasprintf(&msg, fmt, va);
+   if (ret < 0)
+      msg = "(vasprintf failed)";
 
-       fprintf(stderr, "ac_rtld error: %s\n", msg);
+   fprintf(stderr, "ac_rtld error: %s\n", msg);
 
-       if (ret >= 0)
-               free(msg);
+   if (ret >= 0)
+      free(msg);
 }
 
 static void report_errorf(const char *fmt, ...) PRINTFLIKE(1, 2);
 
 static void report_errorf(const char *fmt, ...)
 {
-       va_list va;
-       va_start(va, fmt);
-       report_erroraf(fmt, va);
-       va_end(va);
+   va_list va;
+   va_start(va, fmt);
+   report_erroraf(fmt, va);
+   va_end(va);
 }
 
 static void report_elf_errorf(const char *fmt, ...) PRINTFLIKE(1, 2);
 
 static void report_elf_errorf(const char *fmt, ...)
 {
-       va_list va;
-       va_start(va, fmt);
-       report_erroraf(fmt, va);
-       va_end(va);
+   va_list va;
+   va_start(va, fmt);
+   report_erroraf(fmt, va);
+   va_end(va);
 
-       fprintf(stderr, "ELF error: %s\n", elf_errmsg(elf_errno()));
+   fprintf(stderr, "ELF error: %s\n", elf_errmsg(elf_errno()));
 }
 
 /**
@@ -119,54 +119,53 @@ static void report_elf_errorf(const char *fmt, ...)
  * \p part_idx.
  */
 static const struct ac_rtld_symbol *find_symbol(const struct util_dynarray *symbols,
-                                               const char *name, unsigned part_idx)
+                                                const char *name, unsigned part_idx)
 {
-       util_dynarray_foreach(symbols, struct ac_rtld_symbol, symbol) {
-               if ((symbol->part_idx == ~0u || symbol->part_idx == part_idx) &&
-                   !strcmp(name, symbol->name))
-                       return symbol;
-       }
-       return 0;
+   util_dynarray_foreach (symbols, struct ac_rtld_symbol, symbol) {
+      if ((symbol->part_idx == ~0u || symbol->part_idx == part_idx) && !strcmp(name, symbol->name))
+         return symbol;
+   }
+   return 0;
 }
 
 static int compare_symbol_by_align(const void *lhsp, const void *rhsp)
 {
-       const struct ac_rtld_symbol *lhs = lhsp;
-       const struct ac_rtld_symbol *rhs = rhsp;
-       if (rhs->align > lhs->align)
-               return 1;
-       if (rhs->align < lhs->align)
-               return -1;
-       return 0;
+   const struct ac_rtld_symbol *lhs = lhsp;
+   const struct ac_rtld_symbol *rhs = rhsp;
+   if (rhs->align > lhs->align)
+      return 1;
+   if (rhs->align < lhs->align)
+      return -1;
+   return 0;
 }
 
 /**
  * Sort the given symbol list by decreasing alignment and assign offsets.
  */
 static bool layout_symbols(struct ac_rtld_symbol *symbols, unsigned num_symbols,
-                          uint64_t *ptotal_size)
+                           uint64_t *ptotal_size)
 {
-       qsort(symbols, num_symbols, sizeof(*symbols), compare_symbol_by_align);
+   qsort(symbols, num_symbols, sizeof(*symbols), compare_symbol_by_align);
 
-       uint64_t total_size = *ptotal_size;
+   uint64_t total_size = *ptotal_size;
 
-       for (unsigned i = 0; i < num_symbols; ++i) {
-               struct ac_rtld_symbol *s = &symbols[i];
-               assert(util_is_power_of_two_nonzero(s->align));
+   for (unsigned i = 0; i < num_symbols; ++i) {
+      struct ac_rtld_symbol *s = &symbols[i];
+      assert(util_is_power_of_two_nonzero(s->align));
 
-               total_size = align64(total_size, s->align);
-               s->offset = total_size;
+      total_size = align64(total_size, s->align);
+      s->offset = total_size;
 
-               if (total_size + s->size < total_size) {
-                       report_errorf("%s: size overflow", __FUNCTION__);
-                       return false;
-               }
+      if (total_size + s->size < total_size) {
+         report_errorf("%s: size overflow", __FUNCTION__);
+         return false;
+      }
 
-               total_size += s->size;
-       }
+      total_size += s->size;
+   }
 
-       *ptotal_size = total_size;
-       return true;
+   *ptotal_size = total_size;
+   return true;
 }
 
 /**
@@ -175,71 +174,68 @@ static bool layout_symbols(struct ac_rtld_symbol *symbols, unsigned num_symbols,
  *
  * Shared LDS symbols are filtered out.
  */
-static bool read_private_lds_symbols(struct ac_rtld_binary *binary,
-                                    unsigned part_idx,
-                                    Elf_Scn *section,
-                                    uint32_t *lds_end_align)
+static bool read_private_lds_symbols(struct ac_rtld_binary *binary, unsigned part_idx,
+                                     Elf_Scn *section, uint32_t *lds_end_align)
 {
-#define report_if(cond) \
-       do { \
-               if ((cond)) { \
-                       report_errorf(#cond); \
-                       return false; \
-               } \
-       } while (false)
-#define report_elf_if(cond) \
-       do { \
-               if ((cond)) { \
-                       report_elf_errorf(#cond); \
-                       return false; \
-               } \
-       } while (false)
-
-       struct ac_rtld_part *part = &binary->parts[part_idx];
-       Elf64_Shdr *shdr = elf64_getshdr(section);
-       uint32_t strtabidx = shdr->sh_link;
-       Elf_Data *symbols_data = elf_getdata(section, NULL);
-       report_elf_if(!symbols_data);
-
-       const Elf64_Sym *symbol = symbols_data->d_buf;
-       size_t num_symbols = symbols_data->d_size / sizeof(Elf64_Sym);
-
-       for (size_t j = 0; j < num_symbols; ++j, ++symbol) {
-               struct ac_rtld_symbol s = {};
-
-               if (ELF64_ST_TYPE(symbol->st_info) == STT_AMDGPU_LDS) {
-                       /* old-style LDS symbols from initial prototype -- remove eventually */
-                       s.align = MIN2(1u << (symbol->st_other >> 3), 1u << 16);
-               } else if (symbol->st_shndx == SHN_AMDGPU_LDS) {
-                       s.align = MIN2(symbol->st_value, 1u << 16);
-                       report_if(!util_is_power_of_two_nonzero(s.align));
-               } else
-                       continue;
-
-               report_if(symbol->st_size > 1u << 29);
-
-               s.name = elf_strptr(part->elf, strtabidx, symbol->st_name);
-               s.size = symbol->st_size;
-               s.part_idx = part_idx;
-
-               if (!strcmp(s.name, "__lds_end")) {
-                       report_elf_if(s.size != 0);
-                       *lds_end_align = MAX2(*lds_end_align, s.align);
-                       continue;
-               }
-
-               const struct ac_rtld_symbol *shared =
-                       find_symbol(&binary->lds_symbols, s.name, part_idx);
-               if (shared) {
-                       report_elf_if(s.align > shared->align);
-                       report_elf_if(s.size > shared->size);
-                       continue;
-               }
-
-               util_dynarray_append(&binary->lds_symbols, struct ac_rtld_symbol, s);
-       }
-
-       return true;
+#define report_if(cond)                                                                            \
+   do {                                                                                            \
+      if ((cond)) {                                                                                \
+         report_errorf(#cond);                                                                     \
+         return false;                                                                             \
+      }                                                                                            \
+   } while (false)
+#define report_elf_if(cond)                                                                        \
+   do {                                                                                            \
+      if ((cond)) {                                                                                \
+         report_elf_errorf(#cond);                                                                 \
+         return false;                                                                             \
+      }                                                                                            \
+   } while (false)
+
+   struct ac_rtld_part *part = &binary->parts[part_idx];
+   Elf64_Shdr *shdr = elf64_getshdr(section);
+   uint32_t strtabidx = shdr->sh_link;
+   Elf_Data *symbols_data = elf_getdata(section, NULL);
+   report_elf_if(!symbols_data);
+
+   const Elf64_Sym *symbol = symbols_data->d_buf;
+   size_t num_symbols = symbols_data->d_size / sizeof(Elf64_Sym);
+
+   for (size_t j = 0; j < num_symbols; ++j, ++symbol) {
+      struct ac_rtld_symbol s = {};
+
+      if (ELF64_ST_TYPE(symbol->st_info) == STT_AMDGPU_LDS) {
+         /* old-style LDS symbols from initial prototype -- remove eventually */
+         s.align = MIN2(1u << (symbol->st_other >> 3), 1u << 16);
+      } else if (symbol->st_shndx == SHN_AMDGPU_LDS) {
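+         /* For SHN_AMDGPU_LDS symbols, st_value holds the required alignment. */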
+         s.align = MIN2(symbol->st_value, 1u << 16);
+         report_if(!util_is_power_of_two_nonzero(s.align));
+      } else
+         continue;
+
+      report_if(symbol->st_size > 1u << 29);
+
+      s.name = elf_strptr(part->elf, strtabidx, symbol->st_name);
+      s.size = symbol->st_size;
+      s.part_idx = part_idx;
+
+      if (!strcmp(s.name, "__lds_end")) {
+         report_elf_if(s.size != 0);
+         *lds_end_align = MAX2(*lds_end_align, s.align);
+         continue;
+      }
+
+      const struct ac_rtld_symbol *shared = find_symbol(&binary->lds_symbols, s.name, part_idx);
+      if (shared) {
+         report_elf_if(s.align > shared->align);
+         report_elf_if(s.size > shared->size);
+         continue;
+      }
+
+      util_dynarray_append(&binary->lds_symbols, struct ac_rtld_symbol, s);
+   }
+
+   return true;
 
 #undef report_if
 #undef report_elf_if
@@ -251,486 +247,476 @@ static bool read_private_lds_symbols(struct ac_rtld_binary *binary,
  * \param binary the uninitialized struct
  * \param i binary opening parameters
  */
-bool ac_rtld_open(struct ac_rtld_binary *binary,
-                 struct ac_rtld_open_info i)
+bool ac_rtld_open(struct ac_rtld_binary *binary, struct ac_rtld_open_info i)
 {
-       /* One of the libelf implementations
-        * (http://www.mr511.de/software/english.htm) requires calling
-        * elf_version() before elf_memory().
-        */
-       elf_version(EV_CURRENT);
-
-       memset(binary, 0, sizeof(*binary));
-       memcpy(&binary->options, &i.options, sizeof(binary->options));
-       binary->wave_size = i.wave_size;
-       binary->num_parts = i.num_parts;
-       binary->parts = calloc(sizeof(*binary->parts), i.num_parts);
-       if (!binary->parts)
-               return false;
-
-       uint64_t pasted_text_size = 0;
-       uint64_t rx_align = 1;
-       uint64_t rx_size = 0;
-       uint64_t exec_size = 0;
-
-#define report_if(cond) \
-       do { \
-               if ((cond)) { \
-                       report_errorf(#cond); \
-                       goto fail; \
-               } \
-       } while (false)
-#define report_elf_if(cond) \
-       do { \
-               if ((cond)) { \
-                       report_elf_errorf(#cond); \
-                       goto fail; \
-               } \
-       } while (false)
-
-       /* Copy and layout shared LDS symbols. */
-       if (i.num_shared_lds_symbols) {
-               if (!util_dynarray_resize(&binary->lds_symbols, struct ac_rtld_symbol,
-                                         i.num_shared_lds_symbols))
-                       goto fail;
-
-               memcpy(binary->lds_symbols.data, i.shared_lds_symbols, binary->lds_symbols.size);
-       }
-
-       util_dynarray_foreach(&binary->lds_symbols, struct ac_rtld_symbol, symbol)
-               symbol->part_idx = ~0u;
-
-       unsigned max_lds_size = 64 * 1024;
-
-       if (i.info->chip_class == GFX6 ||
-           (i.shader_type != MESA_SHADER_COMPUTE &&
-            i.shader_type != MESA_SHADER_FRAGMENT))
-               max_lds_size = 32 * 1024;
-
-       uint64_t shared_lds_size = 0;
-       if (!layout_symbols(binary->lds_symbols.data, i.num_shared_lds_symbols, &shared_lds_size))
-               goto fail;
-
-       if (shared_lds_size > max_lds_size) {
-               fprintf(stderr, "ac_rtld error(1): too much LDS (used = %u, max = %u)\n",
-                       (unsigned)shared_lds_size, max_lds_size);
-               goto fail;
-       }
-       binary->lds_size = shared_lds_size;
-
-       /* First pass over all parts: open ELFs, pre-determine the placement of
-        * sections in the memory image, and collect and layout private LDS symbols. */
-       uint32_t lds_end_align = 0;
-
-       if (binary->options.halt_at_entry)
-               pasted_text_size += 4;
-
-       for (unsigned part_idx = 0; part_idx < i.num_parts; ++part_idx) {
-               struct ac_rtld_part *part = &binary->parts[part_idx];
-               unsigned part_lds_symbols_begin =
-                       util_dynarray_num_elements(&binary->lds_symbols, struct ac_rtld_symbol);
-
-               part->elf = elf_memory((char *)i.elf_ptrs[part_idx], i.elf_sizes[part_idx]);
-               report_elf_if(!part->elf);
-
-               const Elf64_Ehdr *ehdr = elf64_getehdr(part->elf);
-               report_elf_if(!ehdr);
-               report_if(ehdr->e_machine != MY_EM_AMDGPU);
-
-               size_t section_str_index;
-               size_t num_shdrs;
-               report_elf_if(elf_getshdrstrndx(part->elf, &section_str_index) < 0);
-               report_elf_if(elf_getshdrnum(part->elf, &num_shdrs) < 0);
-
-               part->num_sections = num_shdrs;
-               part->sections = calloc(sizeof(*part->sections), num_shdrs);
-               report_if(!part->sections);
-
-               Elf_Scn *section = NULL;
-               while ((section = elf_nextscn(part->elf, section))) {
-                       Elf64_Shdr *shdr = elf64_getshdr(section);
-                       struct ac_rtld_section *s = &part->sections[elf_ndxscn(section)];
-                       s->name = elf_strptr(part->elf, section_str_index, shdr->sh_name);
-                       report_elf_if(!s->name);
-
-                       /* Cannot actually handle linked objects yet */
-                       report_elf_if(shdr->sh_addr != 0);
-
-                       /* Alignment must be 0 or a power of two */
-                       report_elf_if(shdr->sh_addralign & (shdr->sh_addralign - 1));
-                       uint64_t sh_align = MAX2(shdr->sh_addralign, 1);
-
-                       if (shdr->sh_flags & SHF_ALLOC &&
-                           shdr->sh_type != SHT_NOTE) {
-                               report_if(shdr->sh_flags & SHF_WRITE);
-
-                               s->is_rx = true;
-
-                               if (shdr->sh_flags & SHF_EXECINSTR) {
-                                       report_elf_if(shdr->sh_size & 3);
-
-                                       if (!strcmp(s->name, ".text"))
-                                               s->is_pasted_text = true;
-
-                                       exec_size += shdr->sh_size;
-                               }
-
-                               if (s->is_pasted_text) {
-                                       s->offset = pasted_text_size;
-                                       pasted_text_size += shdr->sh_size;
-                               } else {
-                                       rx_align = align(rx_align, sh_align);
-                                       rx_size = align(rx_size, sh_align);
-                                       s->offset = rx_size;
-                                       rx_size += shdr->sh_size;
-                               }
-                       } else if (shdr->sh_type == SHT_SYMTAB) {
-                               if (!read_private_lds_symbols(binary, part_idx, section, &lds_end_align))
-                                       goto fail;
-                       }
-               }
-
-               uint64_t part_lds_size = shared_lds_size;
-               if (!layout_symbols(
-                       util_dynarray_element(&binary->lds_symbols, struct ac_rtld_symbol, part_lds_symbols_begin),
-                       util_dynarray_num_elements(&binary->lds_symbols, struct ac_rtld_symbol) - part_lds_symbols_begin,
-                       &part_lds_size))
-                       goto fail;
-               binary->lds_size = MAX2(binary->lds_size, part_lds_size);
-       }
-
-       binary->rx_end_markers = pasted_text_size;
-       pasted_text_size += 4 * DEBUGGER_NUM_MARKERS;
-
-       /* __lds_end is a special symbol that points at the end of the memory
-        * occupied by other LDS symbols. Its alignment is taken as the
-        * maximum of its alignment over all shader parts where it occurs.
-        */
-       if (lds_end_align) {
-               binary->lds_size = align(binary->lds_size, lds_end_align);
-
-               struct ac_rtld_symbol *lds_end =
-                       util_dynarray_grow(&binary->lds_symbols, struct ac_rtld_symbol, 1);
-               lds_end->name = "__lds_end";
-               lds_end->size = 0;
-               lds_end->align = lds_end_align;
-               lds_end->offset = binary->lds_size;
-               lds_end->part_idx = ~0u;
-       }
-
-       if (binary->lds_size > max_lds_size) {
-               fprintf(stderr, "ac_rtld error(2): too much LDS (used = %u, max = %u)\n",
-                       (unsigned)binary->lds_size, max_lds_size);
-               goto fail;
-       }
-
-       /* Second pass: Adjust offsets of non-pasted text sections. */
-       binary->rx_size = pasted_text_size;
-       binary->rx_size = align(binary->rx_size, rx_align);
-
-       for (unsigned part_idx = 0; part_idx < i.num_parts; ++part_idx) {
-               struct ac_rtld_part *part = &binary->parts[part_idx];
-               size_t num_shdrs;
-               elf_getshdrnum(part->elf, &num_shdrs);
-
-               for (unsigned j = 0; j < num_shdrs; ++j) {
-                       struct ac_rtld_section *s = &part->sections[j];
-                       if (s->is_rx && !s->is_pasted_text)
-                               s->offset += binary->rx_size;
-               }
-       }
-
-       binary->rx_size += rx_size;
-       binary->exec_size = exec_size;
-
-       if (i.info->chip_class >= GFX10) {
-               /* In gfx10, the SQ fetches up to 3 cache lines of 16 dwords
-                * ahead of the PC, configurable by SH_MEM_CONFIG and
-                * S_INST_PREFETCH. This can cause two issues:
-                *
-                * (1) Crossing a page boundary to an unmapped page. The logic
-                *     does not distinguish between a required fetch and a "mere"
-                *     prefetch and will fault.
-                *
-                * (2) Prefetching instructions that will be changed for a
-                *     different shader.
-                *
-                * (2) is not currently an issue because we flush the I$ at IB
-                * boundaries, but (1) needs to be addressed. Due to buffer
-                * suballocation, we just play it safe.
-                */
-               binary->rx_size = align(binary->rx_size + 3 * 64, 64);
-       }
-
-       return true;
+   /* One of the libelf implementations
+    * (http://www.mr511.de/software/english.htm) requires calling
+    * elf_version() before elf_memory().
+    */
+   elf_version(EV_CURRENT);
+
+   memset(binary, 0, sizeof(*binary));
+   memcpy(&binary->options, &i.options, sizeof(binary->options));
+   binary->wave_size = i.wave_size;
+   binary->num_parts = i.num_parts;
+   binary->parts = calloc(sizeof(*binary->parts), i.num_parts);
+   if (!binary->parts)
+      return false;
+
+   uint64_t pasted_text_size = 0;
+   uint64_t rx_align = 1;
+   uint64_t rx_size = 0;
+   uint64_t exec_size = 0;
+
+#define report_if(cond)                                                                            \
+   do {                                                                                            \
+      if ((cond)) {                                                                                \
+         report_errorf(#cond);                                                                     \
+         goto fail;                                                                                \
+      }                                                                                            \
+   } while (false)
+#define report_elf_if(cond)                                                                        \
+   do {                                                                                            \
+      if ((cond)) {                                                                                \
+         report_elf_errorf(#cond);                                                                 \
+         goto fail;                                                                                \
+      }                                                                                            \
+   } while (false)
+
+   /* Copy and layout shared LDS symbols. */
+   if (i.num_shared_lds_symbols) {
+      if (!util_dynarray_resize(&binary->lds_symbols, struct ac_rtld_symbol,
+                                i.num_shared_lds_symbols))
+         goto fail;
+
+      memcpy(binary->lds_symbols.data, i.shared_lds_symbols, binary->lds_symbols.size);
+   }
+
+   util_dynarray_foreach (&binary->lds_symbols, struct ac_rtld_symbol, symbol)
+      symbol->part_idx = ~0u;
+
+   unsigned max_lds_size = 64 * 1024;
+
+   if (i.info->chip_class == GFX6 ||
+       (i.shader_type != MESA_SHADER_COMPUTE && i.shader_type != MESA_SHADER_FRAGMENT))
+      max_lds_size = 32 * 1024;
+
+   uint64_t shared_lds_size = 0;
+   if (!layout_symbols(binary->lds_symbols.data, i.num_shared_lds_symbols, &shared_lds_size))
+      goto fail;
+
+   if (shared_lds_size > max_lds_size) {
+      fprintf(stderr, "ac_rtld error(1): too much LDS (used = %u, max = %u)\n",
+              (unsigned)shared_lds_size, max_lds_size);
+      goto fail;
+   }
+   binary->lds_size = shared_lds_size;
+
+   /* First pass over all parts: open ELFs, pre-determine the placement of
+    * sections in the memory image, and collect and layout private LDS symbols. */
+   uint32_t lds_end_align = 0;
+
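+   /* halt_at_entry reserves room for one 4-byte halt instruction at the shader entry point. */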
+   if (binary->options.halt_at_entry)
+      pasted_text_size += 4;
+
+   for (unsigned part_idx = 0; part_idx < i.num_parts; ++part_idx) {
+      struct ac_rtld_part *part = &binary->parts[part_idx];
+      unsigned part_lds_symbols_begin =
+         util_dynarray_num_elements(&binary->lds_symbols, struct ac_rtld_symbol);
+
+      part->elf = elf_memory((char *)i.elf_ptrs[part_idx], i.elf_sizes[part_idx]);
+      report_elf_if(!part->elf);
+
+      const Elf64_Ehdr *ehdr = elf64_getehdr(part->elf);
+      report_elf_if(!ehdr);
+      report_if(ehdr->e_machine != MY_EM_AMDGPU);
+
+      size_t section_str_index;
+      size_t num_shdrs;
+      report_elf_if(elf_getshdrstrndx(part->elf, &section_str_index) < 0);
+      report_elf_if(elf_getshdrnum(part->elf, &num_shdrs) < 0);
+
+      part->num_sections = num_shdrs;
+      part->sections = calloc(sizeof(*part->sections), num_shdrs);
+      report_if(!part->sections);
+
+      Elf_Scn *section = NULL;
+      while ((section = elf_nextscn(part->elf, section))) {
+         Elf64_Shdr *shdr = elf64_getshdr(section);
+         struct ac_rtld_section *s = &part->sections[elf_ndxscn(section)];
+         s->name = elf_strptr(part->elf, section_str_index, shdr->sh_name);
+         report_elf_if(!s->name);
+
+         /* Cannot actually handle linked objects yet */
+         report_elf_if(shdr->sh_addr != 0);
+
+         /* Alignment must be 0 or a power of two */
+         report_elf_if(shdr->sh_addralign & (shdr->sh_addralign - 1));
+         uint64_t sh_align = MAX2(shdr->sh_addralign, 1);
+
+         if (shdr->sh_flags & SHF_ALLOC && shdr->sh_type != SHT_NOTE) {
+            report_if(shdr->sh_flags & SHF_WRITE);
+
+            s->is_rx = true;
+
+            if (shdr->sh_flags & SHF_EXECINSTR) {
+               report_elf_if(shdr->sh_size & 3);
+
+               if (!strcmp(s->name, ".text"))
+                  s->is_pasted_text = true;
+
+               exec_size += shdr->sh_size;
+            }
+
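+            /* .text is pasted contiguously; other r/x sections get aligned slots after it. */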
+            if (s->is_pasted_text) {
+               s->offset = pasted_text_size;
+               pasted_text_size += shdr->sh_size;
+            } else {
+               rx_align = align(rx_align, sh_align);
+               rx_size = align(rx_size, sh_align);
+               s->offset = rx_size;
+               rx_size += shdr->sh_size;
+            }
+         } else if (shdr->sh_type == SHT_SYMTAB) {
+            if (!read_private_lds_symbols(binary, part_idx, section, &lds_end_align))
+               goto fail;
+         }
+      }
+
+      uint64_t part_lds_size = shared_lds_size;
+      if (!layout_symbols(util_dynarray_element(&binary->lds_symbols, struct ac_rtld_symbol,
+                                                part_lds_symbols_begin),
+                          util_dynarray_num_elements(&binary->lds_symbols, struct ac_rtld_symbol) -
+                             part_lds_symbols_begin,
+                          &part_lds_size))
+         goto fail;
+      binary->lds_size = MAX2(binary->lds_size, part_lds_size);
+   }
+
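+   /* Reserve DEBUGGER_NUM_MARKERS end-of-code marker dwords after the pasted text for UMR. */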
+   binary->rx_end_markers = pasted_text_size;
+   pasted_text_size += 4 * DEBUGGER_NUM_MARKERS;
+
+   /* __lds_end is a special symbol that points at the end of the memory
+    * occupied by other LDS symbols. Its alignment is taken as the
+    * maximum of its alignment over all shader parts where it occurs.
+    */
+   if (lds_end_align) {
+      binary->lds_size = align(binary->lds_size, lds_end_align);
+
+      struct ac_rtld_symbol *lds_end =
+         util_dynarray_grow(&binary->lds_symbols, struct ac_rtld_symbol, 1);
+      lds_end->name = "__lds_end";
+      lds_end->size = 0;
+      lds_end->align = lds_end_align;
+      lds_end->offset = binary->lds_size;
+      lds_end->part_idx = ~0u;
+   }
+
+   if (binary->lds_size > max_lds_size) {
+      fprintf(stderr, "ac_rtld error(2): too much LDS (used = %u, max = %u)\n",
+              (unsigned)binary->lds_size, max_lds_size);
+      goto fail;
+   }
+
+   /* Second pass: Adjust offsets of non-pasted text sections. */
+   binary->rx_size = pasted_text_size;
+   binary->rx_size = align(binary->rx_size, rx_align);
+
+   for (unsigned part_idx = 0; part_idx < i.num_parts; ++part_idx) {
+      struct ac_rtld_part *part = &binary->parts[part_idx];
+      size_t num_shdrs;
+      elf_getshdrnum(part->elf, &num_shdrs);
+
+      for (unsigned j = 0; j < num_shdrs; ++j) {
+         struct ac_rtld_section *s = &part->sections[j];
+         if (s->is_rx && !s->is_pasted_text)
+            s->offset += binary->rx_size;
+      }
+   }
+
+   binary->rx_size += rx_size;
+   binary->exec_size = exec_size;
+
+   if (i.info->chip_class >= GFX10) {
+      /* In gfx10, the SQ fetches up to 3 cache lines of 16 dwords
+       * ahead of the PC, configurable by SH_MEM_CONFIG and
+       * S_INST_PREFETCH. This can cause two issues:
+       *
+       * (1) Crossing a page boundary to an unmapped page. The logic
+       *     does not distinguish between a required fetch and a "mere"
+       *     prefetch and will fault.
+       *
+       * (2) Prefetching instructions that will be changed for a
+       *     different shader.
+       *
+       * (2) is not currently an issue because we flush the I$ at IB
+       * boundaries, but (1) needs to be addressed. Due to buffer
+       * suballocation, we just play it safe.
+       */
+      binary->rx_size = align(binary->rx_size + 3 * 64, 64);
+   }
+
+   return true;
 
 #undef report_if
 #undef report_elf_if
 
 fail:
-       ac_rtld_close(binary);
-       return false;
+   ac_rtld_close(binary);
+   return false;
 }
 
 void ac_rtld_close(struct ac_rtld_binary *binary)
 {
-       for (unsigned i = 0; i < binary->num_parts; ++i) {
-               struct ac_rtld_part *part = &binary->parts[i];
-               free(part->sections);
-               elf_end(part->elf);
-       }
-
-       util_dynarray_fini(&binary->lds_symbols);
-       free(binary->parts);
-       binary->parts = NULL;
-       binary->num_parts = 0;
+   for (unsigned i = 0; i < binary->num_parts; ++i) {
+      struct ac_rtld_part *part = &binary->parts[i];
+      free(part->sections);
+      elf_end(part->elf);
+   }
+
+   util_dynarray_fini(&binary->lds_symbols);
+   free(binary->parts);
+   binary->parts = NULL;
+   binary->num_parts = 0;
 }
 
-static bool get_section_by_name(struct ac_rtld_part *part, const char *name,
-                               const char **data, size_t *nbytes)
+static bool get_section_by_name(struct ac_rtld_part *part, const char *name, const char **data,
+                                size_t *nbytes)
 {
-       for (unsigned i = 0; i < part->num_sections; ++i) {
-               struct ac_rtld_section *s = &part->sections[i];
-               if (s->name && !strcmp(name, s->name)) {
-                       Elf_Scn *target_scn = elf_getscn(part->elf, i);
-                       Elf_Data *target_data = elf_getdata(target_scn, NULL);
-                       if (!target_data) {
-                               report_elf_errorf("ac_rtld: get_section_by_name: elf_getdata");
-                               return false;
-                       }
-
-                       *data = target_data->d_buf;
-                       *nbytes = target_data->d_size;
-                       return true;
-               }
-       }
-       return false;
+   for (unsigned i = 0; i < part->num_sections; ++i) {
+      struct ac_rtld_section *s = &part->sections[i];
+      if (s->name && !strcmp(name, s->name)) {
+         Elf_Scn *target_scn = elf_getscn(part->elf, i);
+         Elf_Data *target_data = elf_getdata(target_scn, NULL);
+         if (!target_data) {
+            report_elf_errorf("ac_rtld: get_section_by_name: elf_getdata");
+            return false;
+         }
+
+         *data = target_data->d_buf;
+         *nbytes = target_data->d_size;
+         return true;
+      }
+   }
+   return false;
 }
 
-bool ac_rtld_get_section_by_name(struct ac_rtld_binary *binary, const char *name,
-                                const char **data, size_t *nbytes)
+bool ac_rtld_get_section_by_name(struct ac_rtld_binary *binary, const char *name, const char **data,
+                                 size_t *nbytes)
 {
-       assert(binary->num_parts == 1);
-       return get_section_by_name(&binary->parts[0], name, data, nbytes);
+   assert(binary->num_parts == 1);
+   return get_section_by_name(&binary->parts[0], name, data, nbytes);
 }
 
-bool ac_rtld_read_config(const struct radeon_info *info,
-                        struct ac_rtld_binary *binary,
-                        struct ac_shader_config *config)
+bool ac_rtld_read_config(const struct radeon_info *info, struct ac_rtld_binary *binary,
+                         struct ac_shader_config *config)
 {
-       for (unsigned i = 0; i < binary->num_parts; ++i) {
-               struct ac_rtld_part *part = &binary->parts[i];
-               const char *config_data;
-               size_t config_nbytes;
-
-               if (!get_section_by_name(part, ".AMDGPU.config",
-                                        &config_data, &config_nbytes))
-                       return false;
-
-               /* TODO: be precise about scratch use? */
-               struct ac_shader_config c = {};
-               ac_parse_shader_binary_config(config_data, config_nbytes,
-                                             binary->wave_size, true, info, &c);
-
-               config->num_sgprs = MAX2(config->num_sgprs, c.num_sgprs);
-               config->num_vgprs = MAX2(config->num_vgprs, c.num_vgprs);
-               config->spilled_sgprs = MAX2(config->spilled_sgprs, c.spilled_sgprs);
-               config->spilled_vgprs = MAX2(config->spilled_vgprs, c.spilled_vgprs);
-               config->scratch_bytes_per_wave = MAX2(config->scratch_bytes_per_wave,
-                                                     c.scratch_bytes_per_wave);
-
-               assert(i == 0 || config->float_mode == c.float_mode);
-               config->float_mode = c.float_mode;
-
-               /* SPI_PS_INPUT_ENA/ADDR can't be combined. Only the value from
-                * the main shader part is used. */
-               assert(config->spi_ps_input_ena == 0 &&
-                      config->spi_ps_input_addr == 0);
-               config->spi_ps_input_ena = c.spi_ps_input_ena;
-               config->spi_ps_input_addr = c.spi_ps_input_addr;
-
-               /* TODO: consistently use LDS symbols for this */
-               config->lds_size = MAX2(config->lds_size, c.lds_size);
-
-               /* TODO: Should we combine these somehow? It's currently only
-                * used for radeonsi's compute, where multiple parts aren't used. */
-               assert(config->rsrc1 == 0 && config->rsrc2 == 0);
-               config->rsrc1 = c.rsrc1;
-               config->rsrc2 = c.rsrc2;
-       }
-
-       return true;
+   for (unsigned i = 0; i < binary->num_parts; ++i) {
+      struct ac_rtld_part *part = &binary->parts[i];
+      const char *config_data;
+      size_t config_nbytes;
+
+      if (!get_section_by_name(part, ".AMDGPU.config", &config_data, &config_nbytes))
+         return false;
+
+      /* TODO: be precise about scratch use? */
+      struct ac_shader_config c = {};
+      ac_parse_shader_binary_config(config_data, config_nbytes, binary->wave_size, true, info, &c);
+
+      config->num_sgprs = MAX2(config->num_sgprs, c.num_sgprs);
+      config->num_vgprs = MAX2(config->num_vgprs, c.num_vgprs);
+      config->spilled_sgprs = MAX2(config->spilled_sgprs, c.spilled_sgprs);
+      config->spilled_vgprs = MAX2(config->spilled_vgprs, c.spilled_vgprs);
+      config->scratch_bytes_per_wave =
+         MAX2(config->scratch_bytes_per_wave, c.scratch_bytes_per_wave);
+
+      assert(i == 0 || config->float_mode == c.float_mode);
+      config->float_mode = c.float_mode;
+
+      /* SPI_PS_INPUT_ENA/ADDR can't be combined. Only the value from
+       * the main shader part is used. */
+      assert(config->spi_ps_input_ena == 0 && config->spi_ps_input_addr == 0);
+      config->spi_ps_input_ena = c.spi_ps_input_ena;
+      config->spi_ps_input_addr = c.spi_ps_input_addr;
+
+      /* TODO: consistently use LDS symbols for this */
+      config->lds_size = MAX2(config->lds_size, c.lds_size);
+
+      /* TODO: Should we combine these somehow? It's currently only
+       * used for radeonsi's compute, where multiple parts aren't used. */
+      assert(config->rsrc1 == 0 && config->rsrc2 == 0);
+      config->rsrc1 = c.rsrc1;
+      config->rsrc2 = c.rsrc2;
+   }
+
+   return true;
 }
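Note on the merge rules above: register counts, spill counts and scratch size are combined with MAX2 across parts, while rsrc1/rsrc2 and the PS input masks must come from exactly one part. A hedged sketch of the MAX2 behaviour (field names from ac_shader_config, numbers invented, assumes ac_binary.h and util/u_math.h are included):

   struct ac_shader_config merged = {0};
   struct ac_shader_config part_cfg[2] = {
      {.num_sgprs = 32, .num_vgprs = 24, .scratch_bytes_per_wave = 0},
      {.num_sgprs = 40, .num_vgprs = 16, .scratch_bytes_per_wave = 256},
   };

   for (unsigned i = 0; i < 2; i++) {
      merged.num_sgprs = MAX2(merged.num_sgprs, part_cfg[i].num_sgprs); /* -> 40 */
      merged.num_vgprs = MAX2(merged.num_vgprs, part_cfg[i].num_vgprs); /* -> 24 */
      merged.scratch_bytes_per_wave =
         MAX2(merged.scratch_bytes_per_wave, part_cfg[i].scratch_bytes_per_wave); /* -> 256 */
   }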
 
-static bool resolve_symbol(const struct ac_rtld_upload_info *u,
-                          unsigned part_idx, const Elf64_Sym *sym,
-                          const char *name, uint64_t *value)
+static bool resolve_symbol(const struct ac_rtld_upload_info *u, unsigned part_idx,
+                           const Elf64_Sym *sym, const char *name, uint64_t *value)
 {
-       /* TODO: properly disentangle the undef and the LDS cases once
-        * STT_AMDGPU_LDS is retired. */
-       if (sym->st_shndx == SHN_UNDEF || sym->st_shndx == SHN_AMDGPU_LDS) {
-               const struct ac_rtld_symbol *lds_sym =
-                       find_symbol(&u->binary->lds_symbols, name, part_idx);
-
-               if (lds_sym) {
-                       *value = lds_sym->offset;
-                       return true;
-               }
-
-               /* TODO: resolve from other parts */
-
-               if (u->get_external_symbol(u->cb_data, name, value))
-                       return true;
-
-               report_errorf("symbol %s: unknown", name);
-               return false;
-       }
-
-       struct ac_rtld_part *part = &u->binary->parts[part_idx];
-       if (sym->st_shndx >= part->num_sections) {
-               report_errorf("symbol %s: section out of bounds", name);
-               return false;
-       }
-
-       struct ac_rtld_section *s = &part->sections[sym->st_shndx];
-       if (!s->is_rx) {
-               report_errorf("symbol %s: bad section", name);
-               return false;
-       }
-
-       uint64_t section_base = u->rx_va + s->offset;
-
-       *value = section_base + sym->st_value;
-       return true;
+   /* TODO: properly disentangle the undef and the LDS cases once
+    * STT_AMDGPU_LDS is retired. */
+   if (sym->st_shndx == SHN_UNDEF || sym->st_shndx == SHN_AMDGPU_LDS) {
+      const struct ac_rtld_symbol *lds_sym = find_symbol(&u->binary->lds_symbols, name, part_idx);
+
+      if (lds_sym) {
+         *value = lds_sym->offset;
+         return true;
+      }
+
+      /* TODO: resolve from other parts */
+
+      if (u->get_external_symbol(u->cb_data, name, value))
+         return true;
+
+      report_errorf("symbol %s: unknown", name);
+      return false;
+   }
+
+   struct ac_rtld_part *part = &u->binary->parts[part_idx];
+   if (sym->st_shndx >= part->num_sections) {
+      report_errorf("symbol %s: section out of bounds", name);
+      return false;
+   }
+
+   struct ac_rtld_section *s = &part->sections[sym->st_shndx];
+   if (!s->is_rx) {
+      report_errorf("symbol %s: bad section", name);
+      return false;
+   }
+
+   uint64_t section_base = u->rx_va + s->offset;
+
+   *value = section_base + sym->st_value;
+   return true;
 }
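Note: for undefined symbols that are not LDS symbols, resolve_symbol() falls back to u->get_external_symbol. A hedged sketch of such a callback (the "ring_offsets" name and the cb_data layout are purely illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* Resolves the symbols the driver knows about; returning false makes
 * ac_rtld report "symbol ...: unknown". */
static bool my_get_external_symbol(void *cb_data, const char *symbol, uint64_t *value)
{
   const uint64_t *ring_va = cb_data; /* caller-provided context */

   if (!strcmp(symbol, "ring_offsets")) {
      *value = *ring_va;
      return true;
   }
   return false;
}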
 
-static bool apply_relocs(const struct ac_rtld_upload_info *u,
-                        unsigned part_idx, const Elf64_Shdr *reloc_shdr,
-                        const Elf_Data *reloc_data)
+static bool apply_relocs(const struct ac_rtld_upload_info *u, unsigned part_idx,
+                         const Elf64_Shdr *reloc_shdr, const Elf_Data *reloc_data)
 {
-#define report_if(cond) \
-       do { \
-               if ((cond)) { \
-                       report_errorf(#cond); \
-                       return false; \
-               } \
-       } while (false)
-#define report_elf_if(cond) \
-       do { \
-               if ((cond)) { \
-                       report_elf_errorf(#cond); \
-                       return false; \
-               } \
-       } while (false)
-
-       struct ac_rtld_part *part = &u->binary->parts[part_idx];
-       Elf_Scn *target_scn = elf_getscn(part->elf, reloc_shdr->sh_info);
-       report_elf_if(!target_scn);
-
-       Elf_Data *target_data = elf_getdata(target_scn, NULL);
-       report_elf_if(!target_data);
-
-       Elf_Scn *symbols_scn = elf_getscn(part->elf, reloc_shdr->sh_link);
-       report_elf_if(!symbols_scn);
-
-       Elf64_Shdr *symbols_shdr = elf64_getshdr(symbols_scn);
-       report_elf_if(!symbols_shdr);
-       uint32_t strtabidx = symbols_shdr->sh_link;
-
-       Elf_Data *symbols_data = elf_getdata(symbols_scn, NULL);
-       report_elf_if(!symbols_data);
-
-       const Elf64_Sym *symbols = symbols_data->d_buf;
-       size_t num_symbols = symbols_data->d_size / sizeof(Elf64_Sym);
-
-       struct ac_rtld_section *s = &part->sections[reloc_shdr->sh_info];
-       report_if(!s->is_rx);
-
-       const char *orig_base = target_data->d_buf;
-       char *dst_base = u->rx_ptr + s->offset;
-       uint64_t va_base = u->rx_va + s->offset;
-
-       Elf64_Rel *rel = reloc_data->d_buf;
-       size_t num_relocs = reloc_data->d_size / sizeof(*rel);
-       for (size_t i = 0; i < num_relocs; ++i, ++rel) {
-               size_t r_sym = ELF64_R_SYM(rel->r_info);
-               unsigned r_type = ELF64_R_TYPE(rel->r_info);
-
-               const char *orig_ptr = orig_base + rel->r_offset;
-               char *dst_ptr = dst_base + rel->r_offset;
-               uint64_t va = va_base + rel->r_offset;
-
-               uint64_t symbol;
-               uint64_t addend;
-
-               if (r_sym == STN_UNDEF) {
-                       symbol = 0;
-               } else {
-                       report_elf_if(r_sym >= num_symbols);
-
-                       const Elf64_Sym *sym = &symbols[r_sym];
-                       const char *symbol_name =
-                               elf_strptr(part->elf, strtabidx, sym->st_name);
-                       report_elf_if(!symbol_name);
-
-                       if (!resolve_symbol(u, part_idx, sym, symbol_name, &symbol))
-                               return false;
-               }
-
-               /* TODO: Should we also support .rela sections, where the
-                * addend is part of the relocation record? */
-
-               /* Load the addend from the ELF instead of the destination,
-                * because the destination may be in VRAM. */
-               switch (r_type) {
-               case R_AMDGPU_ABS32:
-               case R_AMDGPU_ABS32_LO:
-               case R_AMDGPU_ABS32_HI:
-               case R_AMDGPU_REL32:
-               case R_AMDGPU_REL32_LO:
-               case R_AMDGPU_REL32_HI:
-                       addend = *(const uint32_t *)orig_ptr;
-                       break;
-               case R_AMDGPU_ABS64:
-               case R_AMDGPU_REL64:
-                       addend = *(const uint64_t *)orig_ptr;
-                       break;
-               default:
-                       report_errorf("unsupported r_type == %u", r_type);
-                       return false;
-               }
-
-               uint64_t abs = symbol + addend;
-
-               switch (r_type) {
-               case R_AMDGPU_ABS32:
-                       assert((uint32_t)abs == abs);
-               case R_AMDGPU_ABS32_LO:
-                       *(uint32_t *)dst_ptr = util_cpu_to_le32(abs);
-                       break;
-               case R_AMDGPU_ABS32_HI:
-                       *(uint32_t *)dst_ptr = util_cpu_to_le32(abs >> 32);
-                       break;
-               case R_AMDGPU_ABS64:
-                       *(uint64_t *)dst_ptr = util_cpu_to_le64(abs);
-                       break;
-               case R_AMDGPU_REL32:
-                       assert((int64_t)(int32_t)(abs - va) == (int64_t)(abs - va));
-               case R_AMDGPU_REL32_LO:
-                       *(uint32_t *)dst_ptr = util_cpu_to_le32(abs - va);
-                       break;
-               case R_AMDGPU_REL32_HI:
-                       *(uint32_t *)dst_ptr = util_cpu_to_le32((abs - va) >> 32);
-                       break;
-               case R_AMDGPU_REL64:
-                       *(uint64_t *)dst_ptr = util_cpu_to_le64(abs - va);
-                       break;
-               default:
-                       unreachable("bad r_type");
-               }
-       }
-
-       return true;
+#define report_if(cond)                                                                            \
+   do {                                                                                            \
+      if ((cond)) {                                                                                \
+         report_errorf(#cond);                                                                     \
+         return false;                                                                             \
+      }                                                                                            \
+   } while (false)
+#define report_elf_if(cond)                                                                        \
+   do {                                                                                            \
+      if ((cond)) {                                                                                \
+         report_elf_errorf(#cond);                                                                 \
+         return false;                                                                             \
+      }                                                                                            \
+   } while (false)
+
+   struct ac_rtld_part *part = &u->binary->parts[part_idx];
+   Elf_Scn *target_scn = elf_getscn(part->elf, reloc_shdr->sh_info);
+   report_elf_if(!target_scn);
+
+   Elf_Data *target_data = elf_getdata(target_scn, NULL);
+   report_elf_if(!target_data);
+
+   Elf_Scn *symbols_scn = elf_getscn(part->elf, reloc_shdr->sh_link);
+   report_elf_if(!symbols_scn);
+
+   Elf64_Shdr *symbols_shdr = elf64_getshdr(symbols_scn);
+   report_elf_if(!symbols_shdr);
+   uint32_t strtabidx = symbols_shdr->sh_link;
+
+   Elf_Data *symbols_data = elf_getdata(symbols_scn, NULL);
+   report_elf_if(!symbols_data);
+
+   const Elf64_Sym *symbols = symbols_data->d_buf;
+   size_t num_symbols = symbols_data->d_size / sizeof(Elf64_Sym);
+
+   struct ac_rtld_section *s = &part->sections[reloc_shdr->sh_info];
+   report_if(!s->is_rx);
+
+   const char *orig_base = target_data->d_buf;
+   char *dst_base = u->rx_ptr + s->offset;
+   uint64_t va_base = u->rx_va + s->offset;
+
+   Elf64_Rel *rel = reloc_data->d_buf;
+   size_t num_relocs = reloc_data->d_size / sizeof(*rel);
+   for (size_t i = 0; i < num_relocs; ++i, ++rel) {
+      size_t r_sym = ELF64_R_SYM(rel->r_info);
+      unsigned r_type = ELF64_R_TYPE(rel->r_info);
+
+      const char *orig_ptr = orig_base + rel->r_offset;
+      char *dst_ptr = dst_base + rel->r_offset;
+      uint64_t va = va_base + rel->r_offset;
+
+      uint64_t symbol;
+      uint64_t addend;
+
+      if (r_sym == STN_UNDEF) {
+         symbol = 0;
+      } else {
+         report_elf_if(r_sym >= num_symbols);
+
+         const Elf64_Sym *sym = &symbols[r_sym];
+         const char *symbol_name = elf_strptr(part->elf, strtabidx, sym->st_name);
+         report_elf_if(!symbol_name);
+
+         if (!resolve_symbol(u, part_idx, sym, symbol_name, &symbol))
+            return false;
+      }
+
+      /* TODO: Should we also support .rela sections, where the
+       * addend is part of the relocation record? */
+
+      /* Load the addend from the ELF instead of the destination,
+       * because the destination may be in VRAM. */
+      switch (r_type) {
+      case R_AMDGPU_ABS32:
+      case R_AMDGPU_ABS32_LO:
+      case R_AMDGPU_ABS32_HI:
+      case R_AMDGPU_REL32:
+      case R_AMDGPU_REL32_LO:
+      case R_AMDGPU_REL32_HI:
+         addend = *(const uint32_t *)orig_ptr;
+         break;
+      case R_AMDGPU_ABS64:
+      case R_AMDGPU_REL64:
+         addend = *(const uint64_t *)orig_ptr;
+         break;
+      default:
+         report_errorf("unsupported r_type == %u", r_type);
+         return false;
+      }
+
+      uint64_t abs = symbol + addend;
+
+      switch (r_type) {
+      case R_AMDGPU_ABS32:
+         assert((uint32_t)abs == abs);
+      case R_AMDGPU_ABS32_LO:
+         *(uint32_t *)dst_ptr = util_cpu_to_le32(abs);
+         break;
+      case R_AMDGPU_ABS32_HI:
+         *(uint32_t *)dst_ptr = util_cpu_to_le32(abs >> 32);
+         break;
+      case R_AMDGPU_ABS64:
+         *(uint64_t *)dst_ptr = util_cpu_to_le64(abs);
+         break;
+      case R_AMDGPU_REL32:
+         assert((int64_t)(int32_t)(abs - va) == (int64_t)(abs - va));
+      case R_AMDGPU_REL32_LO:
+         *(uint32_t *)dst_ptr = util_cpu_to_le32(abs - va);
+         break;
+      case R_AMDGPU_REL32_HI:
+         *(uint32_t *)dst_ptr = util_cpu_to_le32((abs - va) >> 32);
+         break;
+      case R_AMDGPU_REL64:
+         *(uint64_t *)dst_ptr = util_cpu_to_le64(abs - va);
+         break;
+      default:
+         unreachable("bad r_type");
+      }
+   }
+
+   return true;
 
 #undef report_if
 #undef report_elf_if
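Note: every relocation case above reduces to abs = symbol + addend, written either directly (ABS*) or PC-relative (REL*, i.e. abs - va). A runnable check of that arithmetic with invented addresses:

#include <assert.h>
#include <stdint.h>

int main(void)
{
   uint64_t symbol = 0x100000100ull; /* resolved symbol address (made up) */
   uint64_t addend = 0x10;           /* loaded from the original ELF section */
   uint64_t va     = 0x100000200ull; /* address of the word being patched */
   uint64_t abs    = symbol + addend;

   assert((uint32_t)abs == 0x00000110u);         /* R_AMDGPU_ABS32 / _LO */
   assert((uint32_t)(abs >> 32) == 0x00000001u); /* R_AMDGPU_ABS32_HI */
   assert((uint32_t)(abs - va) == 0xffffff10u);  /* R_AMDGPU_REL32 / _LO */
   return 0;
}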
@@ -742,72 +728,72 @@ static bool apply_relocs(const struct ac_rtld_upload_info *u,
  */
 bool ac_rtld_upload(struct ac_rtld_upload_info *u)
 {
-#define report_if(cond) \
-       do { \
-               if ((cond)) { \
-                       report_errorf(#cond); \
-                       return false; \
-               } \
-       } while (false)
-#define report_elf_if(cond) \
-       do { \
-               if ((cond)) { \
-                       report_errorf(#cond); \
-                       return false; \
-               } \
-       } while (false)
-
-       if (u->binary->options.halt_at_entry) {
-               /* s_sethalt 1 */
-               *(uint32_t *)u->rx_ptr = util_cpu_to_le32(0xbf8d0001);
-       }
-
-       /* First pass: upload raw section data and lay out private LDS symbols. */
-       for (unsigned i = 0; i < u->binary->num_parts; ++i) {
-               struct ac_rtld_part *part = &u->binary->parts[i];
-
-               Elf_Scn *section = NULL;
-               while ((section = elf_nextscn(part->elf, section))) {
-                       Elf64_Shdr *shdr = elf64_getshdr(section);
-                       struct ac_rtld_section *s = &part->sections[elf_ndxscn(section)];
-
-                       if (!s->is_rx)
-                               continue;
-
-                       report_if(shdr->sh_type != SHT_PROGBITS);
-
-                       Elf_Data *data = elf_getdata(section, NULL);
-                       report_elf_if(!data || data->d_size != shdr->sh_size);
-                       memcpy(u->rx_ptr + s->offset, data->d_buf, shdr->sh_size);
-               }
-       }
-
-       if (u->binary->rx_end_markers) {
-               uint32_t *dst = (uint32_t *)(u->rx_ptr + u->binary->rx_end_markers);
-               for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; ++i)
-                       *dst++ = util_cpu_to_le32(DEBUGGER_END_OF_CODE_MARKER);
-       }
-
-       /* Second pass: handle relocations, overwriting uploaded data where
-        * appropriate. */
-       for (unsigned i = 0; i < u->binary->num_parts; ++i) {
-               struct ac_rtld_part *part = &u->binary->parts[i];
-               Elf_Scn *section = NULL;
-               while ((section = elf_nextscn(part->elf, section))) {
-                       Elf64_Shdr *shdr = elf64_getshdr(section);
-                       if (shdr->sh_type == SHT_REL) {
-                               Elf_Data *relocs = elf_getdata(section, NULL);
-                               report_elf_if(!relocs || relocs->d_size != shdr->sh_size);
-                               if (!apply_relocs(u, i, shdr, relocs))
-                                       return false;
-                       } else if (shdr->sh_type == SHT_RELA) {
-                               report_errorf("SHT_RELA not supported");
-                               return false;
-                       }
-               }
-       }
-
-       return true;
+#define report_if(cond)                                                                            \
+   do {                                                                                            \
+      if ((cond)) {                                                                                \
+         report_errorf(#cond);                                                                     \
+         return false;                                                                             \
+      }                                                                                            \
+   } while (false)
+#define report_elf_if(cond)                                                                        \
+   do {                                                                                            \
+      if ((cond)) {                                                                                \
+         report_errorf(#cond);                                                                     \
+         return false;                                                                             \
+      }                                                                                            \
+   } while (false)
+
+   if (u->binary->options.halt_at_entry) {
+      /* s_sethalt 1 */
+      *(uint32_t *)u->rx_ptr = util_cpu_to_le32(0xbf8d0001);
+   }
+
+   /* First pass: upload raw section data and lay out private LDS symbols. */
+   for (unsigned i = 0; i < u->binary->num_parts; ++i) {
+      struct ac_rtld_part *part = &u->binary->parts[i];
+
+      Elf_Scn *section = NULL;
+      while ((section = elf_nextscn(part->elf, section))) {
+         Elf64_Shdr *shdr = elf64_getshdr(section);
+         struct ac_rtld_section *s = &part->sections[elf_ndxscn(section)];
+
+         if (!s->is_rx)
+            continue;
+
+         report_if(shdr->sh_type != SHT_PROGBITS);
+
+         Elf_Data *data = elf_getdata(section, NULL);
+         report_elf_if(!data || data->d_size != shdr->sh_size);
+         memcpy(u->rx_ptr + s->offset, data->d_buf, shdr->sh_size);
+      }
+   }
+
+   if (u->binary->rx_end_markers) {
+      uint32_t *dst = (uint32_t *)(u->rx_ptr + u->binary->rx_end_markers);
+      for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; ++i)
+         *dst++ = util_cpu_to_le32(DEBUGGER_END_OF_CODE_MARKER);
+   }
+
+   /* Second pass: handle relocations, overwriting uploaded data where
+    * appropriate. */
+   for (unsigned i = 0; i < u->binary->num_parts; ++i) {
+      struct ac_rtld_part *part = &u->binary->parts[i];
+      Elf_Scn *section = NULL;
+      while ((section = elf_nextscn(part->elf, section))) {
+         Elf64_Shdr *shdr = elf64_getshdr(section);
+         if (shdr->sh_type == SHT_REL) {
+            Elf_Data *relocs = elf_getdata(section, NULL);
+            report_elf_if(!relocs || relocs->d_size != shdr->sh_size);
+            if (!apply_relocs(u, i, shdr, relocs))
+               return false;
+         } else if (shdr->sh_type == SHT_RELA) {
+            report_errorf("SHT_RELA not supported");
+            return false;
+         }
+      }
+   }
+
+   return true;
 
 #undef report_if
 #undef report_elf_if
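Note: the usual call sequence around ac_rtld_upload() is open -> read_config -> allocate an rx buffer of binary.rx_size bytes -> upload -> close. A hedged sketch of that flow (alloc_rx_buffer is a hypothetical driver helper; the shader stage, wave size and error handling are illustrative only):

static bool upload_one_shader(const struct radeon_info *gpu_info,
                              const char *elf_data, size_t elf_size)
{
   struct ac_rtld_binary binary;

   if (!ac_rtld_open(&binary, (struct ac_rtld_open_info){
                                 .info = gpu_info,
                                 .shader_type = MESA_SHADER_COMPUTE,
                                 .wave_size = 64,
                                 .num_parts = 1,
                                 .elf_ptrs = &elf_data,
                                 .elf_sizes = &elf_size,
                              }))
      return false;

   struct ac_shader_config config = {0};
   ac_rtld_read_config(gpu_info, &binary, &config);

   /* alloc_rx_buffer() stands in for whatever maps a GPU buffer of at
    * least binary.rx_size bytes and returns its CPU and GPU addresses. */
   char *rx_ptr;
   uint64_t rx_va = alloc_rx_buffer(binary.rx_size, &rx_ptr);

   struct ac_rtld_upload_info u = {
      .binary = &binary,
      .rx_va = rx_va,
      .rx_ptr = rx_ptr,
   };
   bool ok = ac_rtld_upload(&u);

   ac_rtld_close(&binary);
   return ok;
}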
index 2470a5243f1ea4b5bd077d2ffede7d28fc0b1e89..af03a857f5e6ccb150ce6c9ea932293618fa86b9 100644 (file)
 #ifndef AC_RTLD_H
 #define AC_RTLD_H
 
+#include "compiler/shader_enums.h"
+#include "util/u_dynarray.h"
+
 #include <stdbool.h>
-#include <stdint.h>
 #include <stddef.h>
-
-#include "util/u_dynarray.h"
-#include "compiler/shader_enums.h"
+#include <stdint.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -40,37 +40,37 @@ struct ac_shader_config;
 struct radeon_info;
 
 struct ac_rtld_symbol {
-       const char *name;
-       uint32_t size;
-       uint32_t align;
-       uint64_t offset; /* filled in by ac_rtld_open */
-       unsigned part_idx; /* shader part in which this symbol appears */
+   const char *name;
+   uint32_t size;
+   uint32_t align;
+   uint64_t offset;   /* filled in by ac_rtld_open */
+   unsigned part_idx; /* shader part in which this symbol appears */
 };
 
 struct ac_rtld_options {
-       /* Loader will insert an s_sethalt 1 instruction as the
-        * first instruction. */
-       bool halt_at_entry:1;
+   /* Loader will insert an s_sethalt 1 instruction as the
+    * first instruction. */
+   bool halt_at_entry : 1;
 };
 
 /* Lightweight wrapper around underlying ELF objects. */
 struct ac_rtld_binary {
-       struct ac_rtld_options options;
-       unsigned wave_size;
+   struct ac_rtld_options options;
+   unsigned wave_size;
 
-       /* Required buffer sizes, currently read/executable only. */
-       uint64_t rx_size;
+   /* Required buffer sizes, currently read/executable only. */
+   uint64_t rx_size;
 
-       /* Size of executable code, for reporting purposes. */
-       uint64_t exec_size;
+   /* Size of executable code, for reporting purposes. */
+   uint64_t exec_size;
 
-       uint64_t rx_end_markers;
+   uint64_t rx_end_markers;
 
-       unsigned num_parts;
-       struct ac_rtld_part *parts;
+   unsigned num_parts;
+   struct ac_rtld_part *parts;
 
-       struct util_dynarray lds_symbols;
-       uint32_t lds_size;
+   struct util_dynarray lds_symbols;
+   uint32_t lds_size;
 };
 
 /**
@@ -82,8 +82,7 @@ struct ac_rtld_binary {
  * \param value to be filled in by the callback
  * \return whether the symbol was found successfully
  */
-typedef bool (*ac_rtld_get_external_symbol_cb)(
-       void *cb_data, const char *symbol, uint64_t *value);
+typedef bool (*ac_rtld_get_external_symbol_cb)(void *cb_data, const char *symbol, uint64_t *value);
 
 /**
  * Lifetimes of \ref info, in-memory ELF objects, and the names of
@@ -91,50 +90,48 @@ typedef bool (*ac_rtld_get_external_symbol_cb)(
  * the opened binary.
  */
 struct ac_rtld_open_info {
-       const struct radeon_info *info;
-       struct ac_rtld_options options;
-       gl_shader_stage shader_type;
-       unsigned wave_size;
-
-       unsigned num_parts;
-       const char * const *elf_ptrs; /* in-memory ELF objects of each part */
-       const size_t *elf_sizes; /* sizes of corresponding in-memory ELF objects in bytes */
-
-       /* Shared LDS symbols are layouted such that they are accessible from
-        * all shader parts. Non-shared (private) LDS symbols of one part may
-        * overlap private LDS symbols of another shader part.
-        */
-       unsigned num_shared_lds_symbols;
-       const struct ac_rtld_symbol *shared_lds_symbols;
+   const struct radeon_info *info;
+   struct ac_rtld_options options;
+   gl_shader_stage shader_type;
+   unsigned wave_size;
+
+   unsigned num_parts;
+   const char *const *elf_ptrs; /* in-memory ELF objects of each part */
+   const size_t *elf_sizes;     /* sizes of corresponding in-memory ELF objects in bytes */
+
+   /* Shared LDS symbols are laid out such that they are accessible from
+    * all shader parts. Non-shared (private) LDS symbols of one part may
+    * overlap private LDS symbols of another shader part.
+    */
+   unsigned num_shared_lds_symbols;
+   const struct ac_rtld_symbol *shared_lds_symbols;
 };
 
-bool ac_rtld_open(struct ac_rtld_binary *binary,
-                 struct ac_rtld_open_info i);
+bool ac_rtld_open(struct ac_rtld_binary *binary, struct ac_rtld_open_info i);
 
 void ac_rtld_close(struct ac_rtld_binary *binary);
 
-bool ac_rtld_get_section_by_name(struct ac_rtld_binary *binary, const char *name,
-                                const char **data, size_t *nbytes);
+bool ac_rtld_get_section_by_name(struct ac_rtld_binary *binary, const char *name, const char **data,
+                                 size_t *nbytes);
 
-bool ac_rtld_read_config(const struct radeon_info *info,
-                        struct ac_rtld_binary *binary,
-                        struct ac_shader_config *config);
+bool ac_rtld_read_config(const struct radeon_info *info, struct ac_rtld_binary *binary,
+                         struct ac_shader_config *config);
 
 struct ac_rtld_upload_info {
-       struct ac_rtld_binary *binary;
+   struct ac_rtld_binary *binary;
 
-       /** GPU mapping of the read/executable buffer. */
-       uint64_t rx_va;
+   /** GPU mapping of the read/executable buffer. */
+   uint64_t rx_va;
 
-       /** CPU mapping of the read/executable buffer */
-       char *rx_ptr;
+   /** CPU mapping of the read/executable buffer */
+   char *rx_ptr;
 
-       /** Optional callback function that will be queried for symbols not
-        * defined in any of the binary's parts. */
-       ac_rtld_get_external_symbol_cb get_external_symbol;
+   /** Optional callback function that will be queried for symbols not
+    * defined in any of the binary's parts. */
+   ac_rtld_get_external_symbol_cb get_external_symbol;
 
-       /** Caller-defined data that will be passed to callback functions. */
-       void *cb_data;
+   /** Caller-defined data that will be passed to callback functions. */
+   void *cb_data;
 };
 
 bool ac_rtld_upload(struct ac_rtld_upload_info *u);
index d5600eaca4098954ce577f84069e7146799dd534..d3816e1c0fc0fb2af6f8421760917db2fdcfa6a8 100644 (file)
  */
 
 #include "ac_shader_args.h"
+
 #include "nir/nir_builder.h"
 
-void
-ac_add_arg(struct ac_shader_args *info, enum ac_arg_regfile regfile,
-          unsigned size, enum ac_arg_type type, struct ac_arg *arg)
+void ac_add_arg(struct ac_shader_args *info, enum ac_arg_regfile regfile, unsigned size,
+                enum ac_arg_type type, struct ac_arg *arg)
 {
-       assert(info->arg_count < AC_MAX_ARGS);
+   assert(info->arg_count < AC_MAX_ARGS);
 
-       unsigned offset;
-       if (regfile == AC_ARG_SGPR) {
-               offset = info->num_sgprs_used;
-               info->num_sgprs_used += size;
-       } else {
-               assert(regfile == AC_ARG_VGPR);
-               offset = info->num_vgprs_used;
-               info->num_vgprs_used += size;
-       }
+   unsigned offset;
+   if (regfile == AC_ARG_SGPR) {
+      offset = info->num_sgprs_used;
+      info->num_sgprs_used += size;
+   } else {
+      assert(regfile == AC_ARG_VGPR);
+      offset = info->num_vgprs_used;
+      info->num_vgprs_used += size;
+   }
 
-       info->args[info->arg_count].file = regfile;
-       info->args[info->arg_count].offset = offset;
-       info->args[info->arg_count].size = size;
-       info->args[info->arg_count].type = type;
+   info->args[info->arg_count].file = regfile;
+   info->args[info->arg_count].offset = offset;
+   info->args[info->arg_count].size = size;
+   info->args[info->arg_count].type = type;
 
-       if (arg) {
-               arg->arg_index = info->arg_count;
-               arg->used = true;
-       }
+   if (arg) {
+      arg->arg_index = info->arg_count;
+      arg->used = true;
+   }
 
-       info->arg_count++;
+   info->arg_count++;
 }
-
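Note: a short hedged example of the declaration API above (the particular argument set is invented):

   struct ac_shader_args args = {0};
   struct ac_arg desc_ptr, vertex_id;

   /* A 2-dword descriptor-table pointer in SGPRs, then the vertex id in one VGPR. */
   ac_add_arg(&args, AC_ARG_SGPR, 2, AC_ARG_CONST_DESC_PTR, &desc_ptr);
   ac_add_arg(&args, AC_ARG_VGPR, 1, AC_ARG_INT, &vertex_id);

   /* Now: args.num_sgprs_used == 2, args.num_vgprs_used == 1,
    *      desc_ptr.arg_index == 0, vertex_id.arg_index == 1. */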
index 90798c6eabd835b5047fdfe92c4ec6824308bb9e..c3f4042d3ec3d019377df53989988520fcf179c2 100644 (file)
 #ifndef AC_SHADER_ARGS_H
 #define AC_SHADER_ARGS_H
 
-#include <stdint.h>
 #include <stdbool.h>
+#include <stdint.h>
 
 #define AC_MAX_INLINE_PUSH_CONSTS 8
 
-enum ac_arg_regfile {
-       AC_ARG_SGPR,
-       AC_ARG_VGPR,
+enum ac_arg_regfile
+{
+   AC_ARG_SGPR,
+   AC_ARG_VGPR,
 };
 
-enum ac_arg_type {
-       AC_ARG_FLOAT,
-       AC_ARG_INT,
-       AC_ARG_CONST_PTR, /* Pointer to i8 array */
-       AC_ARG_CONST_FLOAT_PTR, /* Pointer to f32 array */
-       AC_ARG_CONST_PTR_PTR, /* Pointer to pointer to i8 array */
-       AC_ARG_CONST_DESC_PTR, /* Pointer to v4i32 array */
-       AC_ARG_CONST_IMAGE_PTR, /* Pointer to v8i32 array */
+enum ac_arg_type
+{
+   AC_ARG_FLOAT,
+   AC_ARG_INT,
+   AC_ARG_CONST_PTR,       /* Pointer to i8 array */
+   AC_ARG_CONST_FLOAT_PTR, /* Pointer to f32 array */
+   AC_ARG_CONST_PTR_PTR,   /* Pointer to pointer to i8 array */
+   AC_ARG_CONST_DESC_PTR,  /* Pointer to v4i32 array */
+   AC_ARG_CONST_IMAGE_PTR, /* Pointer to v8i32 array */
 };
 
 struct ac_arg {
-       uint8_t arg_index;
-       bool used;
+   uint8_t arg_index;
+   bool used;
 };
 
-
 #define AC_MAX_ARGS 128
 
 struct ac_shader_args {
-       /* Info on how to declare arguments */
-       struct {
-               enum ac_arg_type type;
-               enum ac_arg_regfile file;
-               uint8_t offset;
-               uint8_t size;
-               bool skip;
-       } args[AC_MAX_ARGS];
-
-       uint8_t arg_count;
-       uint8_t sgpr_count;
-       uint8_t num_sgprs_used;
-       uint8_t num_vgprs_used;
-
-       struct ac_arg base_vertex;
-       struct ac_arg start_instance;
-       struct ac_arg draw_id;
-       struct ac_arg vertex_id;
-       struct ac_arg instance_id;
-       struct ac_arg tcs_patch_id;
-       struct ac_arg tcs_rel_ids;
-       struct ac_arg tes_patch_id;
-       struct ac_arg gs_prim_id;
-       struct ac_arg gs_invocation_id;
-
-       /* PS */
-       struct ac_arg frag_pos[4];
-       struct ac_arg front_face;
-       struct ac_arg ancillary;
-       struct ac_arg sample_coverage;
-       struct ac_arg prim_mask;
-       struct ac_arg persp_sample;
-       struct ac_arg persp_center;
-       struct ac_arg persp_centroid;
-       struct ac_arg pull_model;
-       struct ac_arg linear_sample;
-       struct ac_arg linear_center;
-       struct ac_arg linear_centroid;
-
-       /* CS */
-       struct ac_arg local_invocation_ids;
-       struct ac_arg num_work_groups;
-       struct ac_arg workgroup_ids[3];
-       struct ac_arg tg_size;
-
-       /* Vulkan only */
-       struct ac_arg push_constants;
-       struct ac_arg inline_push_consts[AC_MAX_INLINE_PUSH_CONSTS];
-       unsigned num_inline_push_consts;
-       unsigned base_inline_push_consts;
-       struct ac_arg view_index;
+   /* Info on how to declare arguments */
+   struct {
+      enum ac_arg_type type;
+      enum ac_arg_regfile file;
+      uint8_t offset;
+      uint8_t size;
+      bool skip;
+   } args[AC_MAX_ARGS];
+
+   uint8_t arg_count;
+   uint8_t sgpr_count;
+   uint8_t num_sgprs_used;
+   uint8_t num_vgprs_used;
+
+   struct ac_arg base_vertex;
+   struct ac_arg start_instance;
+   struct ac_arg draw_id;
+   struct ac_arg vertex_id;
+   struct ac_arg instance_id;
+   struct ac_arg tcs_patch_id;
+   struct ac_arg tcs_rel_ids;
+   struct ac_arg tes_patch_id;
+   struct ac_arg gs_prim_id;
+   struct ac_arg gs_invocation_id;
+
+   /* PS */
+   struct ac_arg frag_pos[4];
+   struct ac_arg front_face;
+   struct ac_arg ancillary;
+   struct ac_arg sample_coverage;
+   struct ac_arg prim_mask;
+   struct ac_arg persp_sample;
+   struct ac_arg persp_center;
+   struct ac_arg persp_centroid;
+   struct ac_arg pull_model;
+   struct ac_arg linear_sample;
+   struct ac_arg linear_center;
+   struct ac_arg linear_centroid;
+
+   /* CS */
+   struct ac_arg local_invocation_ids;
+   struct ac_arg num_work_groups;
+   struct ac_arg workgroup_ids[3];
+   struct ac_arg tg_size;
+
+   /* Vulkan only */
+   struct ac_arg push_constants;
+   struct ac_arg inline_push_consts[AC_MAX_INLINE_PUSH_CONSTS];
+   unsigned num_inline_push_consts;
+   unsigned base_inline_push_consts;
+   struct ac_arg view_index;
 };
 
-void ac_add_arg(struct ac_shader_args *info, enum ac_arg_regfile regfile,
-               unsigned registers, enum ac_arg_type type,
-               struct ac_arg *arg);
+void ac_add_arg(struct ac_shader_args *info, enum ac_arg_regfile regfile, unsigned registers,
+                enum ac_arg_type type, struct ac_arg *arg);
 
 #endif
-
index d4ccf38d803f2d6d6af5f468f73d13e620f6540d..a57b5cac50523a103178cfd97a9b4a1d870a4a6d 100644 (file)
  * IN THE SOFTWARE.
  */
 
+#include "ac_shader_util.h"
+
+#include "sid.h"
+
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
 
-#include "ac_shader_util.h"
-#include "sid.h"
-
-unsigned
-ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
-                          bool writes_samplemask)
+unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask)
 {
-       if (writes_z) {
-               /* Z needs 32 bits. */
-               if (writes_samplemask)
-                       return V_028710_SPI_SHADER_32_ABGR;
-               else if (writes_stencil)
-                       return V_028710_SPI_SHADER_32_GR;
-               else
-                       return V_028710_SPI_SHADER_32_R;
-       } else if (writes_stencil || writes_samplemask) {
-               /* Both stencil and sample mask need only 16 bits. */
-               return V_028710_SPI_SHADER_UINT16_ABGR;
-       } else {
-               return V_028710_SPI_SHADER_ZERO;
-       }
+   if (writes_z) {
+      /* Z needs 32 bits. */
+      if (writes_samplemask)
+         return V_028710_SPI_SHADER_32_ABGR;
+      else if (writes_stencil)
+         return V_028710_SPI_SHADER_32_GR;
+      else
+         return V_028710_SPI_SHADER_32_R;
+   } else if (writes_stencil || writes_samplemask) {
+      /* Both stencil and sample mask need only 16 bits. */
+      return V_028710_SPI_SHADER_UINT16_ABGR;
+   } else {
+      return V_028710_SPI_SHADER_ZERO;
+   }
 }
 
-unsigned
-ac_get_cb_shader_mask(unsigned spi_shader_col_format)
+unsigned ac_get_cb_shader_mask(unsigned spi_shader_col_format)
 {
-       unsigned i, cb_shader_mask = 0;
-
-       for (i = 0; i < 8; i++) {
-               switch ((spi_shader_col_format >> (i * 4)) & 0xf) {
-               case V_028714_SPI_SHADER_ZERO:
-                       break;
-               case V_028714_SPI_SHADER_32_R:
-                       cb_shader_mask |= 0x1 << (i * 4);
-                       break;
-               case V_028714_SPI_SHADER_32_GR:
-                       cb_shader_mask |= 0x3 << (i * 4);
-                       break;
-               case V_028714_SPI_SHADER_32_AR:
-                       cb_shader_mask |= 0x9u << (i * 4);
-                       break;
-               case V_028714_SPI_SHADER_FP16_ABGR:
-               case V_028714_SPI_SHADER_UNORM16_ABGR:
-               case V_028714_SPI_SHADER_SNORM16_ABGR:
-               case V_028714_SPI_SHADER_UINT16_ABGR:
-               case V_028714_SPI_SHADER_SINT16_ABGR:
-               case V_028714_SPI_SHADER_32_ABGR:
-                       cb_shader_mask |= 0xfu << (i * 4);
-                       break;
-               default:
-                       assert(0);
-               }
-       }
-       return cb_shader_mask;
+   unsigned i, cb_shader_mask = 0;
+
+   for (i = 0; i < 8; i++) {
+      switch ((spi_shader_col_format >> (i * 4)) & 0xf) {
+      case V_028714_SPI_SHADER_ZERO:
+         break;
+      case V_028714_SPI_SHADER_32_R:
+         cb_shader_mask |= 0x1 << (i * 4);
+         break;
+      case V_028714_SPI_SHADER_32_GR:
+         cb_shader_mask |= 0x3 << (i * 4);
+         break;
+      case V_028714_SPI_SHADER_32_AR:
+         cb_shader_mask |= 0x9u << (i * 4);
+         break;
+      case V_028714_SPI_SHADER_FP16_ABGR:
+      case V_028714_SPI_SHADER_UNORM16_ABGR:
+      case V_028714_SPI_SHADER_SNORM16_ABGR:
+      case V_028714_SPI_SHADER_UINT16_ABGR:
+      case V_028714_SPI_SHADER_SINT16_ABGR:
+      case V_028714_SPI_SHADER_32_ABGR:
+         cb_shader_mask |= 0xfu << (i * 4);
+         break;
+      default:
+         assert(0);
+      }
+   }
+   return cb_shader_mask;
 }
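Note: a worked example of the mask construction above, using two color exports (the MRT assignment is invented):

/* spi_shader_col_format: MRT0 = SPI_SHADER_32_R, MRT1 = SPI_SHADER_FP16_ABGR,
 * all other MRTs = SPI_SHADER_ZERO.
 *
 *    cb_shader_mask = (0x1 << 0 * 4) | (0xf << 1 * 4) = 0x000000f1
 *
 * i.e. only the R channel of MRT0 and all four channels of MRT1 are enabled. */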
 
 /**
  * Calculate the appropriate setting of VGT_GS_MODE when \p shader is a
  * geometry shader.
  */
-uint32_t
-ac_vgt_gs_mode(unsigned gs_max_vert_out, enum chip_class chip_class)
+uint32_t ac_vgt_gs_mode(unsigned gs_max_vert_out, enum chip_class chip_class)
 {
-       unsigned cut_mode;
-
-       if (gs_max_vert_out <= 128) {
-               cut_mode = V_028A40_GS_CUT_128;
-       } else if (gs_max_vert_out <= 256) {
-               cut_mode = V_028A40_GS_CUT_256;
-       } else if (gs_max_vert_out <= 512) {
-               cut_mode = V_028A40_GS_CUT_512;
-       } else {
-               assert(gs_max_vert_out <= 1024);
-               cut_mode = V_028A40_GS_CUT_1024;
-       }
-
-       return S_028A40_MODE(V_028A40_GS_SCENARIO_G) |
-              S_028A40_CUT_MODE(cut_mode)|
-              S_028A40_ES_WRITE_OPTIMIZE(chip_class <= GFX8) |
-              S_028A40_GS_WRITE_OPTIMIZE(1) |
-              S_028A40_ONCHIP(chip_class >= GFX9 ? 1 : 0);
+   unsigned cut_mode;
+
+   if (gs_max_vert_out <= 128) {
+      cut_mode = V_028A40_GS_CUT_128;
+   } else if (gs_max_vert_out <= 256) {
+      cut_mode = V_028A40_GS_CUT_256;
+   } else if (gs_max_vert_out <= 512) {
+      cut_mode = V_028A40_GS_CUT_512;
+   } else {
+      assert(gs_max_vert_out <= 1024);
+      cut_mode = V_028A40_GS_CUT_1024;
+   }
+
+   return S_028A40_MODE(V_028A40_GS_SCENARIO_G) | S_028A40_CUT_MODE(cut_mode) |
+          S_028A40_ES_WRITE_OPTIMIZE(chip_class <= GFX8) | S_028A40_GS_WRITE_OPTIMIZE(1) |
+          S_028A40_ONCHIP(chip_class >= GFX9 ? 1 : 0);
 }
 
 /// Translate a (dfmt, nfmt) pair into a chip-appropriate combined format
 /// value for LLVM8+ tbuffer intrinsics.
-unsigned
-ac_get_tbuffer_format(enum chip_class chip_class,
-                     unsigned dfmt, unsigned nfmt)
+unsigned ac_get_tbuffer_format(enum chip_class chip_class, unsigned dfmt, unsigned nfmt)
 {
-       // Some games try to access vertex buffers without a valid format.
-       // This is a game bug, but we should still handle it gracefully.
-       if (dfmt == V_008F0C_IMG_FORMAT_INVALID)
-               return V_008F0C_IMG_FORMAT_INVALID;
-
-       if (chip_class >= GFX10) {
-               unsigned format;
-               switch (dfmt) {
-               default: unreachable("bad dfmt");
-               case V_008F0C_BUF_DATA_FORMAT_INVALID: format = V_008F0C_IMG_FORMAT_INVALID; break;
-               case V_008F0C_BUF_DATA_FORMAT_8: format = V_008F0C_IMG_FORMAT_8_UINT; break;
-               case V_008F0C_BUF_DATA_FORMAT_8_8: format = V_008F0C_IMG_FORMAT_8_8_UINT; break;
-               case V_008F0C_BUF_DATA_FORMAT_8_8_8_8: format = V_008F0C_IMG_FORMAT_8_8_8_8_UINT; break;
-               case V_008F0C_BUF_DATA_FORMAT_16: format = V_008F0C_IMG_FORMAT_16_UINT; break;
-               case V_008F0C_BUF_DATA_FORMAT_16_16: format = V_008F0C_IMG_FORMAT_16_16_UINT; break;
-               case V_008F0C_BUF_DATA_FORMAT_16_16_16_16: format = V_008F0C_IMG_FORMAT_16_16_16_16_UINT; break;
-               case V_008F0C_BUF_DATA_FORMAT_32: format = V_008F0C_IMG_FORMAT_32_UINT; break;
-               case V_008F0C_BUF_DATA_FORMAT_32_32: format = V_008F0C_IMG_FORMAT_32_32_UINT; break;
-               case V_008F0C_BUF_DATA_FORMAT_32_32_32: format = V_008F0C_IMG_FORMAT_32_32_32_UINT; break;
-               case V_008F0C_BUF_DATA_FORMAT_32_32_32_32: format = V_008F0C_IMG_FORMAT_32_32_32_32_UINT; break;
-               case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: format = V_008F0C_IMG_FORMAT_2_10_10_10_UINT; break;
-               }
-
-               // Use the regularity properties of the combined format enum.
-               //
-               // Note: float is incompatible with 8-bit data formats,
-               //       [us]{norm,scaled} are incomparible with 32-bit data formats.
-               //       [us]scaled are not writable.
-               switch (nfmt) {
-               case V_008F0C_BUF_NUM_FORMAT_UNORM: format -= 4; break;
-               case V_008F0C_BUF_NUM_FORMAT_SNORM: format -= 3; break;
-               case V_008F0C_BUF_NUM_FORMAT_USCALED: format -= 2; break;
-               case V_008F0C_BUF_NUM_FORMAT_SSCALED: format -= 1; break;
-               default: unreachable("bad nfmt");
-               case V_008F0C_BUF_NUM_FORMAT_UINT: break;
-               case V_008F0C_BUF_NUM_FORMAT_SINT: format += 1; break;
-               case V_008F0C_BUF_NUM_FORMAT_FLOAT: format += 2; break;
-               }
-
-               return format;
-       } else {
-               return dfmt | (nfmt << 4);
-       }
+   // Some games try to access vertex buffers without a valid format.
+   // This is a game bug, but we should still handle it gracefully.
+   if (dfmt == V_008F0C_IMG_FORMAT_INVALID)
+      return V_008F0C_IMG_FORMAT_INVALID;
+
+   if (chip_class >= GFX10) {
+      unsigned format;
+      switch (dfmt) {
+      default:
+         unreachable("bad dfmt");
+      case V_008F0C_BUF_DATA_FORMAT_INVALID:
+         format = V_008F0C_IMG_FORMAT_INVALID;
+         break;
+      case V_008F0C_BUF_DATA_FORMAT_8:
+         format = V_008F0C_IMG_FORMAT_8_UINT;
+         break;
+      case V_008F0C_BUF_DATA_FORMAT_8_8:
+         format = V_008F0C_IMG_FORMAT_8_8_UINT;
+         break;
+      case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
+         format = V_008F0C_IMG_FORMAT_8_8_8_8_UINT;
+         break;
+      case V_008F0C_BUF_DATA_FORMAT_16:
+         format = V_008F0C_IMG_FORMAT_16_UINT;
+         break;
+      case V_008F0C_BUF_DATA_FORMAT_16_16:
+         format = V_008F0C_IMG_FORMAT_16_16_UINT;
+         break;
+      case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
+         format = V_008F0C_IMG_FORMAT_16_16_16_16_UINT;
+         break;
+      case V_008F0C_BUF_DATA_FORMAT_32:
+         format = V_008F0C_IMG_FORMAT_32_UINT;
+         break;
+      case V_008F0C_BUF_DATA_FORMAT_32_32:
+         format = V_008F0C_IMG_FORMAT_32_32_UINT;
+         break;
+      case V_008F0C_BUF_DATA_FORMAT_32_32_32:
+         format = V_008F0C_IMG_FORMAT_32_32_32_UINT;
+         break;
+      case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
+         format = V_008F0C_IMG_FORMAT_32_32_32_32_UINT;
+         break;
+      case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
+         format = V_008F0C_IMG_FORMAT_2_10_10_10_UINT;
+         break;
+      }
+
+      // Use the regularity properties of the combined format enum.
+      //
+      // Note: float is incompatible with 8-bit data formats,
+      //       [us]{norm,scaled} are incompatible with 32-bit data formats.
+      //       [us]scaled are not writable.
+      switch (nfmt) {
+      case V_008F0C_BUF_NUM_FORMAT_UNORM:
+         format -= 4;
+         break;
+      case V_008F0C_BUF_NUM_FORMAT_SNORM:
+         format -= 3;
+         break;
+      case V_008F0C_BUF_NUM_FORMAT_USCALED:
+         format -= 2;
+         break;
+      case V_008F0C_BUF_NUM_FORMAT_SSCALED:
+         format -= 1;
+         break;
+      default:
+         unreachable("bad nfmt");
+      case V_008F0C_BUF_NUM_FORMAT_UINT:
+         break;
+      case V_008F0C_BUF_NUM_FORMAT_SINT:
+         format += 1;
+         break;
+      case V_008F0C_BUF_NUM_FORMAT_FLOAT:
+         format += 2;
+         break;
+      }
+
+      return format;
+   } else {
+      return dfmt | (nfmt << 4);
+   }
 }
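Note: the GFX10 branch above leans on the layout of the combined IMG_FORMAT enum; each data format's UINT entry is the anchor and the other number formats sit at the fixed offsets in the second switch. Two hedged walk-throughs (the resulting entries are inferred from those offsets, not from spelled-out enum values):

/* dfmt = BUF_DATA_FORMAT_16_16,   nfmt = BUF_NUM_FORMAT_FLOAT:
 *    start at IMG_FORMAT_16_16_UINT, FLOAT adds +2
 *    -> the 16_16 FLOAT entry of the enum.
 *
 * dfmt = BUF_DATA_FORMAT_8_8_8_8, nfmt = BUF_NUM_FORMAT_UNORM:
 *    start at IMG_FORMAT_8_8_8_8_UINT, UNORM subtracts 4
 *    -> the 8_8_8_8 UNORM entry of the enum.
 *
 * Pre-GFX10 the function simply packs the pair as dfmt | (nfmt << 4). */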
 
 static const struct ac_data_format_info data_format_table[] = {
-       [V_008F0C_BUF_DATA_FORMAT_INVALID]     = {  0, 4, 0, V_008F0C_BUF_DATA_FORMAT_INVALID    },
-       [V_008F0C_BUF_DATA_FORMAT_8]           = {  1, 1, 1, V_008F0C_BUF_DATA_FORMAT_8          },
-       [V_008F0C_BUF_DATA_FORMAT_16]          = {  2, 1, 2, V_008F0C_BUF_DATA_FORMAT_16         },
-       [V_008F0C_BUF_DATA_FORMAT_8_8]         = {  2, 2, 1, V_008F0C_BUF_DATA_FORMAT_8          },
-       [V_008F0C_BUF_DATA_FORMAT_32]          = {  4, 1, 4, V_008F0C_BUF_DATA_FORMAT_32         },
-       [V_008F0C_BUF_DATA_FORMAT_16_16]       = {  4, 2, 2, V_008F0C_BUF_DATA_FORMAT_16         },
-       [V_008F0C_BUF_DATA_FORMAT_10_11_11]    = {  4, 3, 0, V_008F0C_BUF_DATA_FORMAT_10_11_11   },
-       [V_008F0C_BUF_DATA_FORMAT_11_11_10]    = {  4, 3, 0, V_008F0C_BUF_DATA_FORMAT_11_11_10   },
-       [V_008F0C_BUF_DATA_FORMAT_10_10_10_2]  = {  4, 4, 0, V_008F0C_BUF_DATA_FORMAT_10_10_10_2 },
-       [V_008F0C_BUF_DATA_FORMAT_2_10_10_10]  = {  4, 4, 0, V_008F0C_BUF_DATA_FORMAT_2_10_10_10 },
-       [V_008F0C_BUF_DATA_FORMAT_8_8_8_8]     = {  4, 4, 1, V_008F0C_BUF_DATA_FORMAT_8          },
-       [V_008F0C_BUF_DATA_FORMAT_32_32]       = {  8, 2, 4, V_008F0C_BUF_DATA_FORMAT_32         },
-       [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = {  8, 4, 2, V_008F0C_BUF_DATA_FORMAT_16         },
-       [V_008F0C_BUF_DATA_FORMAT_32_32_32]    = { 12, 3, 4, V_008F0C_BUF_DATA_FORMAT_32         },
-       [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = { 16, 4, 4, V_008F0C_BUF_DATA_FORMAT_32         },
+   [V_008F0C_BUF_DATA_FORMAT_INVALID] = {0, 4, 0, V_008F0C_BUF_DATA_FORMAT_INVALID},
+   [V_008F0C_BUF_DATA_FORMAT_8] = {1, 1, 1, V_008F0C_BUF_DATA_FORMAT_8},
+   [V_008F0C_BUF_DATA_FORMAT_16] = {2, 1, 2, V_008F0C_BUF_DATA_FORMAT_16},
+   [V_008F0C_BUF_DATA_FORMAT_8_8] = {2, 2, 1, V_008F0C_BUF_DATA_FORMAT_8},
+   [V_008F0C_BUF_DATA_FORMAT_32] = {4, 1, 4, V_008F0C_BUF_DATA_FORMAT_32},
+   [V_008F0C_BUF_DATA_FORMAT_16_16] = {4, 2, 2, V_008F0C_BUF_DATA_FORMAT_16},
+   [V_008F0C_BUF_DATA_FORMAT_10_11_11] = {4, 3, 0, V_008F0C_BUF_DATA_FORMAT_10_11_11},
+   [V_008F0C_BUF_DATA_FORMAT_11_11_10] = {4, 3, 0, V_008F0C_BUF_DATA_FORMAT_11_11_10},
+   [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = {4, 4, 0, V_008F0C_BUF_DATA_FORMAT_10_10_10_2},
+   [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = {4, 4, 0, V_008F0C_BUF_DATA_FORMAT_2_10_10_10},
+   [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = {4, 4, 1, V_008F0C_BUF_DATA_FORMAT_8},
+   [V_008F0C_BUF_DATA_FORMAT_32_32] = {8, 2, 4, V_008F0C_BUF_DATA_FORMAT_32},
+   [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = {8, 4, 2, V_008F0C_BUF_DATA_FORMAT_16},
+   [V_008F0C_BUF_DATA_FORMAT_32_32_32] = {12, 3, 4, V_008F0C_BUF_DATA_FORMAT_32},
+   [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = {16, 4, 4, V_008F0C_BUF_DATA_FORMAT_32},
 };
 
-const struct ac_data_format_info *
-ac_get_data_format_info(unsigned dfmt)
+const struct ac_data_format_info *ac_get_data_format_info(unsigned dfmt)
 {
-       assert(dfmt < ARRAY_SIZE(data_format_table));
-       return &data_format_table[dfmt];
+   assert(dfmt < ARRAY_SIZE(data_format_table));
+   return &data_format_table[dfmt];
 }
 
-enum ac_image_dim
-ac_get_sampler_dim(enum chip_class chip_class, enum glsl_sampler_dim dim,
-                  bool is_array)
+enum ac_image_dim ac_get_sampler_dim(enum chip_class chip_class, enum glsl_sampler_dim dim,
+                                     bool is_array)
 {
-       switch (dim) {
-       case GLSL_SAMPLER_DIM_1D:
-               if (chip_class == GFX9)
-                       return is_array ? ac_image_2darray : ac_image_2d;
-               return is_array ? ac_image_1darray : ac_image_1d;
-       case GLSL_SAMPLER_DIM_2D:
-       case GLSL_SAMPLER_DIM_RECT:
-       case GLSL_SAMPLER_DIM_EXTERNAL:
-               return is_array ? ac_image_2darray : ac_image_2d;
-       case GLSL_SAMPLER_DIM_3D:
-               return ac_image_3d;
-       case GLSL_SAMPLER_DIM_CUBE:
-               return ac_image_cube;
-       case GLSL_SAMPLER_DIM_MS:
-               return is_array ? ac_image_2darraymsaa : ac_image_2dmsaa;
-       case GLSL_SAMPLER_DIM_SUBPASS:
-               return ac_image_2darray;
-       case GLSL_SAMPLER_DIM_SUBPASS_MS:
-               return ac_image_2darraymsaa;
-       default:
-               unreachable("bad sampler dim");
-       }
+   switch (dim) {
+   case GLSL_SAMPLER_DIM_1D:
+      if (chip_class == GFX9)
+         return is_array ? ac_image_2darray : ac_image_2d;
+      return is_array ? ac_image_1darray : ac_image_1d;
+   case GLSL_SAMPLER_DIM_2D:
+   case GLSL_SAMPLER_DIM_RECT:
+   case GLSL_SAMPLER_DIM_EXTERNAL:
+      return is_array ? ac_image_2darray : ac_image_2d;
+   case GLSL_SAMPLER_DIM_3D:
+      return ac_image_3d;
+   case GLSL_SAMPLER_DIM_CUBE:
+      return ac_image_cube;
+   case GLSL_SAMPLER_DIM_MS:
+      return is_array ? ac_image_2darraymsaa : ac_image_2dmsaa;
+   case GLSL_SAMPLER_DIM_SUBPASS:
+      return ac_image_2darray;
+   case GLSL_SAMPLER_DIM_SUBPASS_MS:
+      return ac_image_2darraymsaa;
+   default:
+      unreachable("bad sampler dim");
+   }
 }
 
-enum ac_image_dim
-ac_get_image_dim(enum chip_class chip_class, enum glsl_sampler_dim sdim,
-                bool is_array)
+enum ac_image_dim ac_get_image_dim(enum chip_class chip_class, enum glsl_sampler_dim sdim,
+                                   bool is_array)
 {
-       enum ac_image_dim dim = ac_get_sampler_dim(chip_class, sdim, is_array);
-
-       /* Match the resource type set in the descriptor. */
-       if (dim == ac_image_cube ||
-           (chip_class <= GFX8 && dim == ac_image_3d))
-               dim = ac_image_2darray;
-       else if (sdim == GLSL_SAMPLER_DIM_2D && !is_array && chip_class == GFX9) {
-               /* When a single layer of a 3D texture is bound, the shader
-                * will refer to a 2D target, but the descriptor has a 3D type.
-                * Since the HW ignores BASE_ARRAY in this case, we need to
-                * send 3 coordinates. This doesn't hurt when the underlying
-                * texture is non-3D.
-                */
-               dim = ac_image_3d;
-       }
-
-       return dim;
+   enum ac_image_dim dim = ac_get_sampler_dim(chip_class, sdim, is_array);
+
+   /* Match the resource type set in the descriptor. */
+   if (dim == ac_image_cube || (chip_class <= GFX8 && dim == ac_image_3d))
+      dim = ac_image_2darray;
+   else if (sdim == GLSL_SAMPLER_DIM_2D && !is_array && chip_class == GFX9) {
+      /* When a single layer of a 3D texture is bound, the shader
+       * will refer to a 2D target, but the descriptor has a 3D type.
+       * Since the HW ignores BASE_ARRAY in this case, we need to
+       * send 3 coordinates. This doesn't hurt when the underlying
+       * texture is non-3D.
+       */
+      dim = ac_image_3d;
+   }
+
+   return dim;
 }
 
-unsigned
-ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config,
-                        signed char *face_vgpr_index_ptr,
-                        signed char *ancillary_vgpr_index_ptr)
+unsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config,
+                                  signed char *face_vgpr_index_ptr,
+                                  signed char *ancillary_vgpr_index_ptr)
 {
-       unsigned num_input_vgprs = 0;
-       signed char face_vgpr_index = -1;
-       signed char ancillary_vgpr_index = -1;
-
-       if (G_0286CC_PERSP_SAMPLE_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 2;
-       if (G_0286CC_PERSP_CENTER_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 2;
-       if (G_0286CC_PERSP_CENTROID_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 2;
-       if (G_0286CC_PERSP_PULL_MODEL_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 3;
-       if (G_0286CC_LINEAR_SAMPLE_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 2;
-       if (G_0286CC_LINEAR_CENTER_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 2;
-       if (G_0286CC_LINEAR_CENTROID_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 2;
-       if (G_0286CC_LINE_STIPPLE_TEX_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 1;
-       if (G_0286CC_POS_X_FLOAT_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 1;
-       if (G_0286CC_POS_Y_FLOAT_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 1;
-       if (G_0286CC_POS_Z_FLOAT_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 1;
-       if (G_0286CC_POS_W_FLOAT_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 1;
-       if (G_0286CC_FRONT_FACE_ENA(config->spi_ps_input_addr)) {
-               face_vgpr_index = num_input_vgprs;
-               num_input_vgprs += 1;
-       }
-       if (G_0286CC_ANCILLARY_ENA(config->spi_ps_input_addr)) {
-               ancillary_vgpr_index = num_input_vgprs;
-               num_input_vgprs += 1;
-       }
-       if (G_0286CC_SAMPLE_COVERAGE_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 1;
-       if (G_0286CC_POS_FIXED_PT_ENA(config->spi_ps_input_addr))
-               num_input_vgprs += 1;
-
-       if (face_vgpr_index_ptr)
-               *face_vgpr_index_ptr = face_vgpr_index;
-       if (ancillary_vgpr_index_ptr)
-               *ancillary_vgpr_index_ptr = ancillary_vgpr_index;
-
-       return num_input_vgprs;
+   unsigned num_input_vgprs = 0;
+   signed char face_vgpr_index = -1;
+   signed char ancillary_vgpr_index = -1;
+
+   if (G_0286CC_PERSP_SAMPLE_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 2;
+   if (G_0286CC_PERSP_CENTER_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 2;
+   if (G_0286CC_PERSP_CENTROID_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 2;
+   if (G_0286CC_PERSP_PULL_MODEL_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 3;
+   if (G_0286CC_LINEAR_SAMPLE_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 2;
+   if (G_0286CC_LINEAR_CENTER_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 2;
+   if (G_0286CC_LINEAR_CENTROID_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 2;
+   if (G_0286CC_LINE_STIPPLE_TEX_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 1;
+   if (G_0286CC_POS_X_FLOAT_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 1;
+   if (G_0286CC_POS_Y_FLOAT_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 1;
+   if (G_0286CC_POS_Z_FLOAT_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 1;
+   if (G_0286CC_POS_W_FLOAT_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 1;
+   if (G_0286CC_FRONT_FACE_ENA(config->spi_ps_input_addr)) {
+      face_vgpr_index = num_input_vgprs;
+      num_input_vgprs += 1;
+   }
+   if (G_0286CC_ANCILLARY_ENA(config->spi_ps_input_addr)) {
+      ancillary_vgpr_index = num_input_vgprs;
+      num_input_vgprs += 1;
+   }
+   if (G_0286CC_SAMPLE_COVERAGE_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 1;
+   if (G_0286CC_POS_FIXED_PT_ENA(config->spi_ps_input_addr))
+      num_input_vgprs += 1;
+
+   if (face_vgpr_index_ptr)
+      *face_vgpr_index_ptr = face_vgpr_index;
+   if (ancillary_vgpr_index_ptr)
+      *ancillary_vgpr_index_ptr = ancillary_vgpr_index;
+
+   return num_input_vgprs;
 }
 
-void ac_choose_spi_color_formats(unsigned format, unsigned swap,
-                                unsigned ntype, bool is_depth,
-                                struct ac_spi_color_formats *formats)
+void ac_choose_spi_color_formats(unsigned format, unsigned swap, unsigned ntype, bool is_depth,
+                                 struct ac_spi_color_formats *formats)
 {
    /* Alpha is needed for alpha-to-coverage.
     * Blending may be with or without alpha.
index 49e1eb2428f15710f437c1cce0d4a978bff97b74..c2a52337a3dfd7263b38b12ca5176d60557c1ad3 100644 (file)
 #ifndef AC_SHADER_UTIL_H
 #define AC_SHADER_UTIL_H
 
-#include <stdbool.h>
-#include <stdint.h>
-
-#include "amd_family.h"
 #include "ac_binary.h"
+#include "amd_family.h"
 #include "compiler/nir/nir.h"
 
+#include <stdbool.h>
+#include <stdint.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-enum ac_image_dim {
-       ac_image_1d,
-       ac_image_2d,
-       ac_image_3d,
-       ac_image_cube, // includes cube arrays
-       ac_image_1darray,
-       ac_image_2darray,
-       ac_image_2dmsaa,
-       ac_image_2darraymsaa,
+enum ac_image_dim
+{
+   ac_image_1d,
+   ac_image_2d,
+   ac_image_3d,
+   ac_image_cube, // includes cube arrays
+   ac_image_1darray,
+   ac_image_2darray,
+   ac_image_2dmsaa,
+   ac_image_2darraymsaa,
 };
 
 struct ac_data_format_info {
-       uint8_t element_size;
-       uint8_t num_channels;
-       uint8_t chan_byte_size;
-       uint8_t chan_format;
+   uint8_t element_size;
+   uint8_t num_channels;
+   uint8_t chan_byte_size;
+   uint8_t chan_format;
 };
 
 struct ac_spi_color_formats {
-       unsigned normal : 8;
-       unsigned alpha : 8;
-       unsigned blend : 8;
-       unsigned blend_alpha : 8;
+   unsigned normal : 8;
+   unsigned alpha : 8;
+   unsigned blend : 8;
+   unsigned blend_alpha : 8;
 };
 
-unsigned
-ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil,
-                          bool writes_samplemask);
+unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask);
 
-unsigned
-ac_get_cb_shader_mask(unsigned spi_shader_col_format);
+unsigned ac_get_cb_shader_mask(unsigned spi_shader_col_format);
 
-uint32_t
-ac_vgt_gs_mode(unsigned gs_max_vert_out, enum chip_class chip_class);
+uint32_t ac_vgt_gs_mode(unsigned gs_max_vert_out, enum chip_class chip_class);
 
-unsigned
-ac_get_tbuffer_format(enum chip_class chip_class,
-                     unsigned dfmt, unsigned nfmt);
+unsigned ac_get_tbuffer_format(enum chip_class chip_class, unsigned dfmt, unsigned nfmt);
 
-const struct ac_data_format_info *
-ac_get_data_format_info(unsigned dfmt);
+const struct ac_data_format_info *ac_get_data_format_info(unsigned dfmt);
 
-enum ac_image_dim
-ac_get_sampler_dim(enum chip_class chip_class, enum glsl_sampler_dim dim,
-                  bool is_array);
+enum ac_image_dim ac_get_sampler_dim(enum chip_class chip_class, enum glsl_sampler_dim dim,
+                                     bool is_array);
 
-enum ac_image_dim
-ac_get_image_dim(enum chip_class chip_class, enum glsl_sampler_dim sdim,
-                bool is_array);
+enum ac_image_dim ac_get_image_dim(enum chip_class chip_class, enum glsl_sampler_dim sdim,
+                                   bool is_array);
 
-unsigned
-ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config,
-                        signed char *face_vgpr_index,
-                        signed char *ancillary_vgpr_index);
+unsigned ac_get_fs_input_vgpr_cnt(const struct ac_shader_config *config,
+                                  signed char *face_vgpr_index, signed char *ancillary_vgpr_index);
 
-void ac_choose_spi_color_formats(unsigned format, unsigned swap,
-                                unsigned ntype, bool is_depth,
-                                struct ac_spi_color_formats *formats);
+void ac_choose_spi_color_formats(unsigned format, unsigned swap, unsigned ntype, bool is_depth,
+                                 struct ac_spi_color_formats *formats);
 
 #ifdef __cplusplus
 }
index d08ccf0765f9e342dd61bce95cbc055c694f66ff..1ef2df5afb761d44f2fc0bd73e6eddefdcd3ad70 100644 (file)
  */
 
 #include "ac_shadowed_regs.h"
+
 #include "ac_debug.h"
 #include "sid.h"
 #include "util/macros.h"
 #include "util/u_debug.h"
+
 #include <stdio.h>
 
 static const struct ac_reg_range Gfx9UserConfigShadowRange[] = {
@@ -522,7 +524,8 @@ static const struct ac_reg_range Navi10NonShadowedRanges[] = {
       VGT_DMA_PRIMITIVE_TYPE,
       VGT_DMA_LS_HS_CONFIG - VGT_DMA_PRIMITIVE_TYPE + 4,
    },*/
-   /* VGT_INDEX_TYPE and VGT_DMA_INDEX_TYPE are a special case and neither of these should be shadowed. */
+   /* VGT_INDEX_TYPE and VGT_DMA_INDEX_TYPE are a special case and neither of these should be
+      shadowed. */
    {
       R_028A7C_VGT_DMA_INDEX_TYPE,
       4,
@@ -731,7 +734,8 @@ static const struct ac_reg_range Gfx103NonShadowedRanges[] = {
       VGT_DMA_PRIMITIVE_TYPE,
       VGT_DMA_LS_HS_CONFIG - VGT_DMA_PRIMITIVE_TYPE + 4,
    },*/
-   /* VGT_INDEX_TYPE and VGT_DMA_INDEX_TYPE are a special case and neither of these should be shadowed. */
+   /* VGT_INDEX_TYPE and VGT_DMA_INDEX_TYPE are a special case and neither of these should be
+      shadowed. */
    {
       R_028A7C_VGT_DMA_INDEX_TYPE,
       4,
@@ -816,7 +820,11 @@ void ac_get_reg_ranges(enum chip_class chip_class, enum radeon_family family,
                        enum ac_reg_range_type type, unsigned *num_ranges,
                        const struct ac_reg_range **ranges)
 {
-#define RETURN(array) do { *ranges = array; *num_ranges = ARRAY_SIZE(array); } while (0)
+#define RETURN(array)                                                                              \
+   do {                                                                                            \
+      *ranges = array;                                                                             \
+      *num_ranges = ARRAY_SIZE(array);                                                             \
+   } while (0)
 
    *num_ranges = 0;
    *ranges = NULL;
@@ -841,8 +849,7 @@ void ac_get_reg_ranges(enum chip_class chip_class, enum radeon_family family,
    case SI_REG_RANGE_SH:
       if (chip_class == GFX10_3 || chip_class == GFX10)
          RETURN(Gfx10ShShadowRange);
-      else if (family == CHIP_RAVEN2 ||
-               family == CHIP_RENOIR)
+      else if (family == CHIP_RAVEN2 || family == CHIP_RENOIR)
          RETURN(Gfx9ShShadowRangeRaven2);
       else if (chip_class == GFX9)
          RETURN(Gfx9ShShadowRange);
@@ -850,8 +857,7 @@ void ac_get_reg_ranges(enum chip_class chip_class, enum radeon_family family,
    case SI_REG_RANGE_CS_SH:
       if (chip_class == GFX10_3 || chip_class == GFX10)
          RETURN(Gfx10CsShShadowRange);
-      else if (family == CHIP_RAVEN2 ||
-               family == CHIP_RENOIR)
+      else if (family == CHIP_RAVEN2 || family == CHIP_RENOIR)
          RETURN(Gfx9CsShShadowRangeRaven2);
       else if (chip_class == GFX9)
          RETURN(Gfx9CsShShadowRange);
@@ -876,68 +882,68 @@ static void gfx9_emulate_clear_state(struct radeon_cmdbuf *cs,
                                      set_context_reg_seq_array_fn set_context_reg_seq_array)
 {
    static const uint32_t DbRenderControlGfx9[] = {
-      0x0       , // DB_RENDER_CONTROL
-      0x0       , // DB_COUNT_CONTROL
-      0x0       , // DB_DEPTH_VIEW
-      0x0       , // DB_RENDER_OVERRIDE
-      0x0       , // DB_RENDER_OVERRIDE2
-      0x0       , // DB_HTILE_DATA_BASE
-      0x0       , // DB_HTILE_DATA_BASE_HI
-      0x0       , // DB_DEPTH_SIZE
-      0x0       , // DB_DEPTH_BOUNDS_MIN
-      0x0       , // DB_DEPTH_BOUNDS_MAX
-      0x0       , // DB_STENCIL_CLEAR
-      0x0       , // DB_DEPTH_CLEAR
-      0x0       , // PA_SC_SCREEN_SCISSOR_TL
+      0x0,        // DB_RENDER_CONTROL
+      0x0,        // DB_COUNT_CONTROL
+      0x0,        // DB_DEPTH_VIEW
+      0x0,        // DB_RENDER_OVERRIDE
+      0x0,        // DB_RENDER_OVERRIDE2
+      0x0,        // DB_HTILE_DATA_BASE
+      0x0,        // DB_HTILE_DATA_BASE_HI
+      0x0,        // DB_DEPTH_SIZE
+      0x0,        // DB_DEPTH_BOUNDS_MIN
+      0x0,        // DB_DEPTH_BOUNDS_MAX
+      0x0,        // DB_STENCIL_CLEAR
+      0x0,        // DB_DEPTH_CLEAR
+      0x0,        // PA_SC_SCREEN_SCISSOR_TL
       0x40004000, // PA_SC_SCREEN_SCISSOR_BR
-      0x0       , // DB_Z_INFO
-      0x0       , // DB_STENCIL_INFO
-      0x0       , // DB_Z_READ_BASE
-      0x0       , // DB_Z_READ_BASE_HI
-      0x0       , // DB_STENCIL_READ_BASE
-      0x0       , // DB_STENCIL_READ_BASE_HI
-      0x0       , // DB_Z_WRITE_BASE
-      0x0       , // DB_Z_WRITE_BASE_HI
-      0x0       , // DB_STENCIL_WRITE_BASE
-      0x0       , // DB_STENCIL_WRITE_BASE_HI
-      0x0       , // DB_DFSM_CONTROL
-      0x0       , //
-      0x0       , // DB_Z_INFO2
-      0x0       , // DB_STENCIL_INFO2
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , // TA_BC_BASE_ADDR
+      0x0,        // DB_Z_INFO
+      0x0,        // DB_STENCIL_INFO
+      0x0,        // DB_Z_READ_BASE
+      0x0,        // DB_Z_READ_BASE_HI
+      0x0,        // DB_STENCIL_READ_BASE
+      0x0,        // DB_STENCIL_READ_BASE_HI
+      0x0,        // DB_Z_WRITE_BASE
+      0x0,        // DB_Z_WRITE_BASE_HI
+      0x0,        // DB_STENCIL_WRITE_BASE
+      0x0,        // DB_STENCIL_WRITE_BASE_HI
+      0x0,        // DB_DFSM_CONTROL
+      0x0,        //
+      0x0,        // DB_Z_INFO2
+      0x0,        // DB_STENCIL_INFO2
+      0x0,        //
+      0x0,        //
+      0x0,        //
+      0x0,        //
+      0x0,        // TA_BC_BASE_ADDR
       0x0         // TA_BC_BASE_ADDR_HI
    };
    static const uint32_t CoherDestBaseHi0Gfx9[] = {
-      0x0       , // COHER_DEST_BASE_HI_0
-      0x0       , // COHER_DEST_BASE_HI_1
-      0x0       , // COHER_DEST_BASE_HI_2
-      0x0       , // COHER_DEST_BASE_HI_3
-      0x0       , // COHER_DEST_BASE_2
-      0x0       , // COHER_DEST_BASE_3
-      0x0       , // PA_SC_WINDOW_OFFSET
+      0x0,        // COHER_DEST_BASE_HI_0
+      0x0,        // COHER_DEST_BASE_HI_1
+      0x0,        // COHER_DEST_BASE_HI_2
+      0x0,        // COHER_DEST_BASE_HI_3
+      0x0,        // COHER_DEST_BASE_2
+      0x0,        // COHER_DEST_BASE_3
+      0x0,        // PA_SC_WINDOW_OFFSET
       0x80000000, // PA_SC_WINDOW_SCISSOR_TL
       0x40004000, // PA_SC_WINDOW_SCISSOR_BR
-      0xffff    , // PA_SC_CLIPRECT_RULE
-      0x0       , // PA_SC_CLIPRECT_0_TL
+      0xffff,     // PA_SC_CLIPRECT_RULE
+      0x0,        // PA_SC_CLIPRECT_0_TL
       0x40004000, // PA_SC_CLIPRECT_0_BR
-      0x0       , // PA_SC_CLIPRECT_1_TL
+      0x0,        // PA_SC_CLIPRECT_1_TL
       0x40004000, // PA_SC_CLIPRECT_1_BR
-      0x0       , // PA_SC_CLIPRECT_2_TL
+      0x0,        // PA_SC_CLIPRECT_2_TL
       0x40004000, // PA_SC_CLIPRECT_2_BR
-      0x0       , // PA_SC_CLIPRECT_3_TL
+      0x0,        // PA_SC_CLIPRECT_3_TL
       0x40004000, // PA_SC_CLIPRECT_3_BR
       0xaa99aaaa, // PA_SC_EDGERULE
-      0x0       , // PA_SU_HARDWARE_SCREEN_OFFSET
+      0x0,        // PA_SU_HARDWARE_SCREEN_OFFSET
       0xffffffff, // CB_TARGET_MASK
       0xffffffff, // CB_SHADER_MASK
       0x80000000, // PA_SC_GENERIC_SCISSOR_TL
       0x40004000, // PA_SC_GENERIC_SCISSOR_BR
-      0x0       , // COHER_DEST_BASE_0
-      0x0       , // COHER_DEST_BASE_1
+      0x0,        // COHER_DEST_BASE_0
+      0x0,        // COHER_DEST_BASE_1
       0x80000000, // PA_SC_VPORT_SCISSOR_0_TL
       0x40004000, // PA_SC_VPORT_SCISSOR_0_BR
       0x80000000, // PA_SC_VPORT_SCISSOR_1_TL
@@ -970,529 +976,529 @@ static void gfx9_emulate_clear_state(struct radeon_cmdbuf *cs,
       0x40004000, // PA_SC_VPORT_SCISSOR_14_BR
       0x80000000, // PA_SC_VPORT_SCISSOR_15_TL
       0x40004000, // PA_SC_VPORT_SCISSOR_15_BR
-      0x0       , // PA_SC_VPORT_ZMIN_0
+      0x0,        // PA_SC_VPORT_ZMIN_0
       0x3f800000, // PA_SC_VPORT_ZMAX_0
-      0x0       , // PA_SC_VPORT_ZMIN_1
+      0x0,        // PA_SC_VPORT_ZMIN_1
       0x3f800000, // PA_SC_VPORT_ZMAX_1
-      0x0       , // PA_SC_VPORT_ZMIN_2
+      0x0,        // PA_SC_VPORT_ZMIN_2
       0x3f800000, // PA_SC_VPORT_ZMAX_2
-      0x0       , // PA_SC_VPORT_ZMIN_3
+      0x0,        // PA_SC_VPORT_ZMIN_3
       0x3f800000, // PA_SC_VPORT_ZMAX_3
-      0x0       , // PA_SC_VPORT_ZMIN_4
+      0x0,        // PA_SC_VPORT_ZMIN_4
       0x3f800000, // PA_SC_VPORT_ZMAX_4
-      0x0       , // PA_SC_VPORT_ZMIN_5
+      0x0,        // PA_SC_VPORT_ZMIN_5
       0x3f800000, // PA_SC_VPORT_ZMAX_5
-      0x0       , // PA_SC_VPORT_ZMIN_6
+      0x0,        // PA_SC_VPORT_ZMIN_6
       0x3f800000, // PA_SC_VPORT_ZMAX_6
-      0x0       , // PA_SC_VPORT_ZMIN_7
+      0x0,        // PA_SC_VPORT_ZMIN_7
       0x3f800000, // PA_SC_VPORT_ZMAX_7
-      0x0       , // PA_SC_VPORT_ZMIN_8
+      0x0,        // PA_SC_VPORT_ZMIN_8
       0x3f800000, // PA_SC_VPORT_ZMAX_8
-      0x0       , // PA_SC_VPORT_ZMIN_9
+      0x0,        // PA_SC_VPORT_ZMIN_9
       0x3f800000, // PA_SC_VPORT_ZMAX_9
-      0x0       , // PA_SC_VPORT_ZMIN_10
+      0x0,        // PA_SC_VPORT_ZMIN_10
       0x3f800000, // PA_SC_VPORT_ZMAX_10
-      0x0       , // PA_SC_VPORT_ZMIN_11
+      0x0,        // PA_SC_VPORT_ZMIN_11
       0x3f800000, // PA_SC_VPORT_ZMAX_11
-      0x0       , // PA_SC_VPORT_ZMIN_12
+      0x0,        // PA_SC_VPORT_ZMIN_12
       0x3f800000, // PA_SC_VPORT_ZMAX_12
-      0x0       , // PA_SC_VPORT_ZMIN_13
+      0x0,        // PA_SC_VPORT_ZMIN_13
       0x3f800000, // PA_SC_VPORT_ZMAX_13
-      0x0       , // PA_SC_VPORT_ZMIN_14
+      0x0,        // PA_SC_VPORT_ZMIN_14
       0x3f800000, // PA_SC_VPORT_ZMAX_14
-      0x0       , // PA_SC_VPORT_ZMIN_15
+      0x0,        // PA_SC_VPORT_ZMIN_15
       0x3f800000, // PA_SC_VPORT_ZMAX_15
-      0x0       , // PA_SC_RASTER_CONFIG
-      0x0       , // PA_SC_RASTER_CONFIG_1
-      0x0       , //
+      0x0,        // PA_SC_RASTER_CONFIG
+      0x0,        // PA_SC_RASTER_CONFIG_1
+      0x0,        //
       0x0         // PA_SC_TILE_STEERING_OVERRIDE
    };
    static const uint32_t VgtMultiPrimIbResetIndxGfx9[] = {
-      0x0         // VGT_MULTI_PRIM_IB_RESET_INDX
+      0x0 // VGT_MULTI_PRIM_IB_RESET_INDX
    };
    static const uint32_t CbBlendRedGfx9[] = {
-      0x0       , // CB_BLEND_RED
-      0x0       , // CB_BLEND_GREEN
-      0x0       , // CB_BLEND_BLUE
-      0x0       , // CB_BLEND_ALPHA
-      0x0       , // CB_DCC_CONTROL
-      0x0       , //
-      0x0       , // DB_STENCIL_CONTROL
-      0x1000000 , // DB_STENCILREFMASK
-      0x1000000 , // DB_STENCILREFMASK_BF
-      0x0       , //
-      0x0       , // PA_CL_VPORT_XSCALE
-      0x0       , // PA_CL_VPORT_XOFFSET
-      0x0       , // PA_CL_VPORT_YSCALE
-      0x0       , // PA_CL_VPORT_YOFFSET
-      0x0       , // PA_CL_VPORT_ZSCALE
-      0x0       , // PA_CL_VPORT_ZOFFSET
-      0x0       , // PA_CL_VPORT_XSCALE_1
-      0x0       , // PA_CL_VPORT_XOFFSET_1
-      0x0       , // PA_CL_VPORT_YSCALE_1
-      0x0       , // PA_CL_VPORT_YOFFSET_1
-      0x0       , // PA_CL_VPORT_ZSCALE_1
-      0x0       , // PA_CL_VPORT_ZOFFSET_1
-      0x0       , // PA_CL_VPORT_XSCALE_2
-      0x0       , // PA_CL_VPORT_XOFFSET_2
-      0x0       , // PA_CL_VPORT_YSCALE_2
-      0x0       , // PA_CL_VPORT_YOFFSET_2
-      0x0       , // PA_CL_VPORT_ZSCALE_2
-      0x0       , // PA_CL_VPORT_ZOFFSET_2
-      0x0       , // PA_CL_VPORT_XSCALE_3
-      0x0       , // PA_CL_VPORT_XOFFSET_3
-      0x0       , // PA_CL_VPORT_YSCALE_3
-      0x0       , // PA_CL_VPORT_YOFFSET_3
-      0x0       , // PA_CL_VPORT_ZSCALE_3
-      0x0       , // PA_CL_VPORT_ZOFFSET_3
-      0x0       , // PA_CL_VPORT_XSCALE_4
-      0x0       , // PA_CL_VPORT_XOFFSET_4
-      0x0       , // PA_CL_VPORT_YSCALE_4
-      0x0       , // PA_CL_VPORT_YOFFSET_4
-      0x0       , // PA_CL_VPORT_ZSCALE_4
-      0x0       , // PA_CL_VPORT_ZOFFSET_4
-      0x0       , // PA_CL_VPORT_XSCALE_5
-      0x0       , // PA_CL_VPORT_XOFFSET_5
-      0x0       , // PA_CL_VPORT_YSCALE_5
-      0x0       , // PA_CL_VPORT_YOFFSET_5
-      0x0       , // PA_CL_VPORT_ZSCALE_5
-      0x0       , // PA_CL_VPORT_ZOFFSET_5
-      0x0       , // PA_CL_VPORT_XSCALE_6
-      0x0       , // PA_CL_VPORT_XOFFSET_6
-      0x0       , // PA_CL_VPORT_YSCALE_6
-      0x0       , // PA_CL_VPORT_YOFFSET_6
-      0x0       , // PA_CL_VPORT_ZSCALE_6
-      0x0       , // PA_CL_VPORT_ZOFFSET_6
-      0x0       , // PA_CL_VPORT_XSCALE_7
-      0x0       , // PA_CL_VPORT_XOFFSET_7
-      0x0       , // PA_CL_VPORT_YSCALE_7
-      0x0       , // PA_CL_VPORT_YOFFSET_7
-      0x0       , // PA_CL_VPORT_ZSCALE_7
-      0x0       , // PA_CL_VPORT_ZOFFSET_7
-      0x0       , // PA_CL_VPORT_XSCALE_8
-      0x0       , // PA_CL_VPORT_XOFFSET_8
-      0x0       , // PA_CL_VPORT_YSCALE_8
-      0x0       , // PA_CL_VPORT_YOFFSET_8
-      0x0       , // PA_CL_VPORT_ZSCALE_8
-      0x0       , // PA_CL_VPORT_ZOFFSET_8
-      0x0       , // PA_CL_VPORT_XSCALE_9
-      0x0       , // PA_CL_VPORT_XOFFSET_9
-      0x0       , // PA_CL_VPORT_YSCALE_9
-      0x0       , // PA_CL_VPORT_YOFFSET_9
-      0x0       , // PA_CL_VPORT_ZSCALE_9
-      0x0       , // PA_CL_VPORT_ZOFFSET_9
-      0x0       , // PA_CL_VPORT_XSCALE_10
-      0x0       , // PA_CL_VPORT_XOFFSET_10
-      0x0       , // PA_CL_VPORT_YSCALE_10
-      0x0       , // PA_CL_VPORT_YOFFSET_10
-      0x0       , // PA_CL_VPORT_ZSCALE_10
-      0x0       , // PA_CL_VPORT_ZOFFSET_10
-      0x0       , // PA_CL_VPORT_XSCALE_11
-      0x0       , // PA_CL_VPORT_XOFFSET_11
-      0x0       , // PA_CL_VPORT_YSCALE_11
-      0x0       , // PA_CL_VPORT_YOFFSET_11
-      0x0       , // PA_CL_VPORT_ZSCALE_11
-      0x0       , // PA_CL_VPORT_ZOFFSET_11
-      0x0       , // PA_CL_VPORT_XSCALE_12
-      0x0       , // PA_CL_VPORT_XOFFSET_12
-      0x0       , // PA_CL_VPORT_YSCALE_12
-      0x0       , // PA_CL_VPORT_YOFFSET_12
-      0x0       , // PA_CL_VPORT_ZSCALE_12
-      0x0       , // PA_CL_VPORT_ZOFFSET_12
-      0x0       , // PA_CL_VPORT_XSCALE_13
-      0x0       , // PA_CL_VPORT_XOFFSET_13
-      0x0       , // PA_CL_VPORT_YSCALE_13
-      0x0       , // PA_CL_VPORT_YOFFSET_13
-      0x0       , // PA_CL_VPORT_ZSCALE_13
-      0x0       , // PA_CL_VPORT_ZOFFSET_13
-      0x0       , // PA_CL_VPORT_XSCALE_14
-      0x0       , // PA_CL_VPORT_XOFFSET_14
-      0x0       , // PA_CL_VPORT_YSCALE_14
-      0x0       , // PA_CL_VPORT_YOFFSET_14
-      0x0       , // PA_CL_VPORT_ZSCALE_14
-      0x0       , // PA_CL_VPORT_ZOFFSET_14
-      0x0       , // PA_CL_VPORT_XSCALE_15
-      0x0       , // PA_CL_VPORT_XOFFSET_15
-      0x0       , // PA_CL_VPORT_YSCALE_15
-      0x0       , // PA_CL_VPORT_YOFFSET_15
-      0x0       , // PA_CL_VPORT_ZSCALE_15
-      0x0       , // PA_CL_VPORT_ZOFFSET_15
-      0x0       , // PA_CL_UCP_0_X
-      0x0       , // PA_CL_UCP_0_Y
-      0x0       , // PA_CL_UCP_0_Z
-      0x0       , // PA_CL_UCP_0_W
-      0x0       , // PA_CL_UCP_1_X
-      0x0       , // PA_CL_UCP_1_Y
-      0x0       , // PA_CL_UCP_1_Z
-      0x0       , // PA_CL_UCP_1_W
-      0x0       , // PA_CL_UCP_2_X
-      0x0       , // PA_CL_UCP_2_Y
-      0x0       , // PA_CL_UCP_2_Z
-      0x0       , // PA_CL_UCP_2_W
-      0x0       , // PA_CL_UCP_3_X
-      0x0       , // PA_CL_UCP_3_Y
-      0x0       , // PA_CL_UCP_3_Z
-      0x0       , // PA_CL_UCP_3_W
-      0x0       , // PA_CL_UCP_4_X
-      0x0       , // PA_CL_UCP_4_Y
-      0x0       , // PA_CL_UCP_4_Z
-      0x0       , // PA_CL_UCP_4_W
-      0x0       , // PA_CL_UCP_5_X
-      0x0       , // PA_CL_UCP_5_Y
-      0x0       , // PA_CL_UCP_5_Z
-      0x0         // PA_CL_UCP_5_W
+      0x0,       // CB_BLEND_RED
+      0x0,       // CB_BLEND_GREEN
+      0x0,       // CB_BLEND_BLUE
+      0x0,       // CB_BLEND_ALPHA
+      0x0,       // CB_DCC_CONTROL
+      0x0,       //
+      0x0,       // DB_STENCIL_CONTROL
+      0x1000000, // DB_STENCILREFMASK
+      0x1000000, // DB_STENCILREFMASK_BF
+      0x0,       //
+      0x0,       // PA_CL_VPORT_XSCALE
+      0x0,       // PA_CL_VPORT_XOFFSET
+      0x0,       // PA_CL_VPORT_YSCALE
+      0x0,       // PA_CL_VPORT_YOFFSET
+      0x0,       // PA_CL_VPORT_ZSCALE
+      0x0,       // PA_CL_VPORT_ZOFFSET
+      0x0,       // PA_CL_VPORT_XSCALE_1
+      0x0,       // PA_CL_VPORT_XOFFSET_1
+      0x0,       // PA_CL_VPORT_YSCALE_1
+      0x0,       // PA_CL_VPORT_YOFFSET_1
+      0x0,       // PA_CL_VPORT_ZSCALE_1
+      0x0,       // PA_CL_VPORT_ZOFFSET_1
+      0x0,       // PA_CL_VPORT_XSCALE_2
+      0x0,       // PA_CL_VPORT_XOFFSET_2
+      0x0,       // PA_CL_VPORT_YSCALE_2
+      0x0,       // PA_CL_VPORT_YOFFSET_2
+      0x0,       // PA_CL_VPORT_ZSCALE_2
+      0x0,       // PA_CL_VPORT_ZOFFSET_2
+      0x0,       // PA_CL_VPORT_XSCALE_3
+      0x0,       // PA_CL_VPORT_XOFFSET_3
+      0x0,       // PA_CL_VPORT_YSCALE_3
+      0x0,       // PA_CL_VPORT_YOFFSET_3
+      0x0,       // PA_CL_VPORT_ZSCALE_3
+      0x0,       // PA_CL_VPORT_ZOFFSET_3
+      0x0,       // PA_CL_VPORT_XSCALE_4
+      0x0,       // PA_CL_VPORT_XOFFSET_4
+      0x0,       // PA_CL_VPORT_YSCALE_4
+      0x0,       // PA_CL_VPORT_YOFFSET_4
+      0x0,       // PA_CL_VPORT_ZSCALE_4
+      0x0,       // PA_CL_VPORT_ZOFFSET_4
+      0x0,       // PA_CL_VPORT_XSCALE_5
+      0x0,       // PA_CL_VPORT_XOFFSET_5
+      0x0,       // PA_CL_VPORT_YSCALE_5
+      0x0,       // PA_CL_VPORT_YOFFSET_5
+      0x0,       // PA_CL_VPORT_ZSCALE_5
+      0x0,       // PA_CL_VPORT_ZOFFSET_5
+      0x0,       // PA_CL_VPORT_XSCALE_6
+      0x0,       // PA_CL_VPORT_XOFFSET_6
+      0x0,       // PA_CL_VPORT_YSCALE_6
+      0x0,       // PA_CL_VPORT_YOFFSET_6
+      0x0,       // PA_CL_VPORT_ZSCALE_6
+      0x0,       // PA_CL_VPORT_ZOFFSET_6
+      0x0,       // PA_CL_VPORT_XSCALE_7
+      0x0,       // PA_CL_VPORT_XOFFSET_7
+      0x0,       // PA_CL_VPORT_YSCALE_7
+      0x0,       // PA_CL_VPORT_YOFFSET_7
+      0x0,       // PA_CL_VPORT_ZSCALE_7
+      0x0,       // PA_CL_VPORT_ZOFFSET_7
+      0x0,       // PA_CL_VPORT_XSCALE_8
+      0x0,       // PA_CL_VPORT_XOFFSET_8
+      0x0,       // PA_CL_VPORT_YSCALE_8
+      0x0,       // PA_CL_VPORT_YOFFSET_8
+      0x0,       // PA_CL_VPORT_ZSCALE_8
+      0x0,       // PA_CL_VPORT_ZOFFSET_8
+      0x0,       // PA_CL_VPORT_XSCALE_9
+      0x0,       // PA_CL_VPORT_XOFFSET_9
+      0x0,       // PA_CL_VPORT_YSCALE_9
+      0x0,       // PA_CL_VPORT_YOFFSET_9
+      0x0,       // PA_CL_VPORT_ZSCALE_9
+      0x0,       // PA_CL_VPORT_ZOFFSET_9
+      0x0,       // PA_CL_VPORT_XSCALE_10
+      0x0,       // PA_CL_VPORT_XOFFSET_10
+      0x0,       // PA_CL_VPORT_YSCALE_10
+      0x0,       // PA_CL_VPORT_YOFFSET_10
+      0x0,       // PA_CL_VPORT_ZSCALE_10
+      0x0,       // PA_CL_VPORT_ZOFFSET_10
+      0x0,       // PA_CL_VPORT_XSCALE_11
+      0x0,       // PA_CL_VPORT_XOFFSET_11
+      0x0,       // PA_CL_VPORT_YSCALE_11
+      0x0,       // PA_CL_VPORT_YOFFSET_11
+      0x0,       // PA_CL_VPORT_ZSCALE_11
+      0x0,       // PA_CL_VPORT_ZOFFSET_11
+      0x0,       // PA_CL_VPORT_XSCALE_12
+      0x0,       // PA_CL_VPORT_XOFFSET_12
+      0x0,       // PA_CL_VPORT_YSCALE_12
+      0x0,       // PA_CL_VPORT_YOFFSET_12
+      0x0,       // PA_CL_VPORT_ZSCALE_12
+      0x0,       // PA_CL_VPORT_ZOFFSET_12
+      0x0,       // PA_CL_VPORT_XSCALE_13
+      0x0,       // PA_CL_VPORT_XOFFSET_13
+      0x0,       // PA_CL_VPORT_YSCALE_13
+      0x0,       // PA_CL_VPORT_YOFFSET_13
+      0x0,       // PA_CL_VPORT_ZSCALE_13
+      0x0,       // PA_CL_VPORT_ZOFFSET_13
+      0x0,       // PA_CL_VPORT_XSCALE_14
+      0x0,       // PA_CL_VPORT_XOFFSET_14
+      0x0,       // PA_CL_VPORT_YSCALE_14
+      0x0,       // PA_CL_VPORT_YOFFSET_14
+      0x0,       // PA_CL_VPORT_ZSCALE_14
+      0x0,       // PA_CL_VPORT_ZOFFSET_14
+      0x0,       // PA_CL_VPORT_XSCALE_15
+      0x0,       // PA_CL_VPORT_XOFFSET_15
+      0x0,       // PA_CL_VPORT_YSCALE_15
+      0x0,       // PA_CL_VPORT_YOFFSET_15
+      0x0,       // PA_CL_VPORT_ZSCALE_15
+      0x0,       // PA_CL_VPORT_ZOFFSET_15
+      0x0,       // PA_CL_UCP_0_X
+      0x0,       // PA_CL_UCP_0_Y
+      0x0,       // PA_CL_UCP_0_Z
+      0x0,       // PA_CL_UCP_0_W
+      0x0,       // PA_CL_UCP_1_X
+      0x0,       // PA_CL_UCP_1_Y
+      0x0,       // PA_CL_UCP_1_Z
+      0x0,       // PA_CL_UCP_1_W
+      0x0,       // PA_CL_UCP_2_X
+      0x0,       // PA_CL_UCP_2_Y
+      0x0,       // PA_CL_UCP_2_Z
+      0x0,       // PA_CL_UCP_2_W
+      0x0,       // PA_CL_UCP_3_X
+      0x0,       // PA_CL_UCP_3_Y
+      0x0,       // PA_CL_UCP_3_Z
+      0x0,       // PA_CL_UCP_3_W
+      0x0,       // PA_CL_UCP_4_X
+      0x0,       // PA_CL_UCP_4_Y
+      0x0,       // PA_CL_UCP_4_Z
+      0x0,       // PA_CL_UCP_4_W
+      0x0,       // PA_CL_UCP_5_X
+      0x0,       // PA_CL_UCP_5_Y
+      0x0,       // PA_CL_UCP_5_Z
+      0x0        // PA_CL_UCP_5_W
    };
    static const uint32_t SpiPsInputCntl0Gfx9[] = {
-      0x0       , // SPI_PS_INPUT_CNTL_0
-      0x0       , // SPI_PS_INPUT_CNTL_1
-      0x0       , // SPI_PS_INPUT_CNTL_2
-      0x0       , // SPI_PS_INPUT_CNTL_3
-      0x0       , // SPI_PS_INPUT_CNTL_4
-      0x0       , // SPI_PS_INPUT_CNTL_5
-      0x0       , // SPI_PS_INPUT_CNTL_6
-      0x0       , // SPI_PS_INPUT_CNTL_7
-      0x0       , // SPI_PS_INPUT_CNTL_8
-      0x0       , // SPI_PS_INPUT_CNTL_9
-      0x0       , // SPI_PS_INPUT_CNTL_10
-      0x0       , // SPI_PS_INPUT_CNTL_11
-      0x0       , // SPI_PS_INPUT_CNTL_12
-      0x0       , // SPI_PS_INPUT_CNTL_13
-      0x0       , // SPI_PS_INPUT_CNTL_14
-      0x0       , // SPI_PS_INPUT_CNTL_15
-      0x0       , // SPI_PS_INPUT_CNTL_16
-      0x0       , // SPI_PS_INPUT_CNTL_17
-      0x0       , // SPI_PS_INPUT_CNTL_18
-      0x0       , // SPI_PS_INPUT_CNTL_19
-      0x0       , // SPI_PS_INPUT_CNTL_20
-      0x0       , // SPI_PS_INPUT_CNTL_21
-      0x0       , // SPI_PS_INPUT_CNTL_22
-      0x0       , // SPI_PS_INPUT_CNTL_23
-      0x0       , // SPI_PS_INPUT_CNTL_24
-      0x0       , // SPI_PS_INPUT_CNTL_25
-      0x0       , // SPI_PS_INPUT_CNTL_26
-      0x0       , // SPI_PS_INPUT_CNTL_27
-      0x0       , // SPI_PS_INPUT_CNTL_28
-      0x0       , // SPI_PS_INPUT_CNTL_29
-      0x0       , // SPI_PS_INPUT_CNTL_30
-      0x0       , // SPI_PS_INPUT_CNTL_31
-      0x0       , // SPI_VS_OUT_CONFIG
-      0x0       , //
-      0x0       , // SPI_PS_INPUT_ENA
-      0x0       , // SPI_PS_INPUT_ADDR
-      0x0       , // SPI_INTERP_CONTROL_0
-      0x2       , // SPI_PS_IN_CONTROL
-      0x0       , //
-      0x0       , // SPI_BARYC_CNTL
-      0x0       , //
-      0x0       , // SPI_TMPRING_SIZE
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , // SPI_SHADER_POS_FORMAT
-      0x0       , // SPI_SHADER_Z_FORMAT
-      0x0         // SPI_SHADER_COL_FORMAT
+      0x0, // SPI_PS_INPUT_CNTL_0
+      0x0, // SPI_PS_INPUT_CNTL_1
+      0x0, // SPI_PS_INPUT_CNTL_2
+      0x0, // SPI_PS_INPUT_CNTL_3
+      0x0, // SPI_PS_INPUT_CNTL_4
+      0x0, // SPI_PS_INPUT_CNTL_5
+      0x0, // SPI_PS_INPUT_CNTL_6
+      0x0, // SPI_PS_INPUT_CNTL_7
+      0x0, // SPI_PS_INPUT_CNTL_8
+      0x0, // SPI_PS_INPUT_CNTL_9
+      0x0, // SPI_PS_INPUT_CNTL_10
+      0x0, // SPI_PS_INPUT_CNTL_11
+      0x0, // SPI_PS_INPUT_CNTL_12
+      0x0, // SPI_PS_INPUT_CNTL_13
+      0x0, // SPI_PS_INPUT_CNTL_14
+      0x0, // SPI_PS_INPUT_CNTL_15
+      0x0, // SPI_PS_INPUT_CNTL_16
+      0x0, // SPI_PS_INPUT_CNTL_17
+      0x0, // SPI_PS_INPUT_CNTL_18
+      0x0, // SPI_PS_INPUT_CNTL_19
+      0x0, // SPI_PS_INPUT_CNTL_20
+      0x0, // SPI_PS_INPUT_CNTL_21
+      0x0, // SPI_PS_INPUT_CNTL_22
+      0x0, // SPI_PS_INPUT_CNTL_23
+      0x0, // SPI_PS_INPUT_CNTL_24
+      0x0, // SPI_PS_INPUT_CNTL_25
+      0x0, // SPI_PS_INPUT_CNTL_26
+      0x0, // SPI_PS_INPUT_CNTL_27
+      0x0, // SPI_PS_INPUT_CNTL_28
+      0x0, // SPI_PS_INPUT_CNTL_29
+      0x0, // SPI_PS_INPUT_CNTL_30
+      0x0, // SPI_PS_INPUT_CNTL_31
+      0x0, // SPI_VS_OUT_CONFIG
+      0x0, //
+      0x0, // SPI_PS_INPUT_ENA
+      0x0, // SPI_PS_INPUT_ADDR
+      0x0, // SPI_INTERP_CONTROL_0
+      0x2, // SPI_PS_IN_CONTROL
+      0x0, //
+      0x0, // SPI_BARYC_CNTL
+      0x0, //
+      0x0, // SPI_TMPRING_SIZE
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, // SPI_SHADER_POS_FORMAT
+      0x0, // SPI_SHADER_Z_FORMAT
+      0x0  // SPI_SHADER_COL_FORMAT
    };
    static const uint32_t SxPsDownconvertGfx9[] = {
-      0x0       , // SX_PS_DOWNCONVERT
-      0x0       , // SX_BLEND_OPT_EPSILON
-      0x0       , // SX_BLEND_OPT_CONTROL
-      0x0       , // SX_MRT0_BLEND_OPT
-      0x0       , // SX_MRT1_BLEND_OPT
-      0x0       , // SX_MRT2_BLEND_OPT
-      0x0       , // SX_MRT3_BLEND_OPT
-      0x0       , // SX_MRT4_BLEND_OPT
-      0x0       , // SX_MRT5_BLEND_OPT
-      0x0       , // SX_MRT6_BLEND_OPT
-      0x0       , // SX_MRT7_BLEND_OPT
-      0x0       , // CB_BLEND0_CONTROL
-      0x0       , // CB_BLEND1_CONTROL
-      0x0       , // CB_BLEND2_CONTROL
-      0x0       , // CB_BLEND3_CONTROL
-      0x0       , // CB_BLEND4_CONTROL
-      0x0       , // CB_BLEND5_CONTROL
-      0x0       , // CB_BLEND6_CONTROL
-      0x0       , // CB_BLEND7_CONTROL
-      0x0       , // CB_MRT0_EPITCH
-      0x0       , // CB_MRT1_EPITCH
-      0x0       , // CB_MRT2_EPITCH
-      0x0       , // CB_MRT3_EPITCH
-      0x0       , // CB_MRT4_EPITCH
-      0x0       , // CB_MRT5_EPITCH
-      0x0       , // CB_MRT6_EPITCH
-      0x0         // CB_MRT7_EPITCH
+      0x0, // SX_PS_DOWNCONVERT
+      0x0, // SX_BLEND_OPT_EPSILON
+      0x0, // SX_BLEND_OPT_CONTROL
+      0x0, // SX_MRT0_BLEND_OPT
+      0x0, // SX_MRT1_BLEND_OPT
+      0x0, // SX_MRT2_BLEND_OPT
+      0x0, // SX_MRT3_BLEND_OPT
+      0x0, // SX_MRT4_BLEND_OPT
+      0x0, // SX_MRT5_BLEND_OPT
+      0x0, // SX_MRT6_BLEND_OPT
+      0x0, // SX_MRT7_BLEND_OPT
+      0x0, // CB_BLEND0_CONTROL
+      0x0, // CB_BLEND1_CONTROL
+      0x0, // CB_BLEND2_CONTROL
+      0x0, // CB_BLEND3_CONTROL
+      0x0, // CB_BLEND4_CONTROL
+      0x0, // CB_BLEND5_CONTROL
+      0x0, // CB_BLEND6_CONTROL
+      0x0, // CB_BLEND7_CONTROL
+      0x0, // CB_MRT0_EPITCH
+      0x0, // CB_MRT1_EPITCH
+      0x0, // CB_MRT2_EPITCH
+      0x0, // CB_MRT3_EPITCH
+      0x0, // CB_MRT4_EPITCH
+      0x0, // CB_MRT5_EPITCH
+      0x0, // CB_MRT6_EPITCH
+      0x0  // CB_MRT7_EPITCH
    };
    static const uint32_t DbDepthControlGfx9[] = {
-      0x0       , // DB_DEPTH_CONTROL
-      0x0       , // DB_EQAA
-      0x0       , // CB_COLOR_CONTROL
-      0x0       , // DB_SHADER_CONTROL
-      0x90000   , // PA_CL_CLIP_CNTL
-      0x4       , // PA_SU_SC_MODE_CNTL
-      0x0       , // PA_CL_VTE_CNTL
-      0x0       , // PA_CL_VS_OUT_CNTL
-      0x0         // PA_CL_NANINF_CNTL
+      0x0,     // DB_DEPTH_CONTROL
+      0x0,     // DB_EQAA
+      0x0,     // CB_COLOR_CONTROL
+      0x0,     // DB_SHADER_CONTROL
+      0x90000, // PA_CL_CLIP_CNTL
+      0x4,     // PA_SU_SC_MODE_CNTL
+      0x0,     // PA_CL_VTE_CNTL
+      0x0,     // PA_CL_VS_OUT_CNTL
+      0x0      // PA_CL_NANINF_CNTL
    };
    static const uint32_t PaSuPrimFilterCntlGfx9[] = {
-      0x0       , // PA_SU_PRIM_FILTER_CNTL
-      0x0       , // PA_SU_SMALL_PRIM_FILTER_CNTL
-      0x0       , // PA_CL_OBJPRIM_ID_CNTL
-      0x0       , // PA_CL_NGG_CNTL
-      0x0       , // PA_SU_OVER_RASTERIZATION_CNTL
-      0x0         // PA_STEREO_CNTL
+      0x0, // PA_SU_PRIM_FILTER_CNTL
+      0x0, // PA_SU_SMALL_PRIM_FILTER_CNTL
+      0x0, // PA_CL_OBJPRIM_ID_CNTL
+      0x0, // PA_CL_NGG_CNTL
+      0x0, // PA_SU_OVER_RASTERIZATION_CNTL
+      0x0  // PA_STEREO_CNTL
    };
    static const uint32_t PaSuPointSizeGfx9[] = {
-      0x0       , // PA_SU_POINT_SIZE
-      0x0       , // PA_SU_POINT_MINMAX
-      0x0       , // PA_SU_LINE_CNTL
-      0x0         // PA_SC_LINE_STIPPLE
+      0x0, // PA_SU_POINT_SIZE
+      0x0, // PA_SU_POINT_MINMAX
+      0x0, // PA_SU_LINE_CNTL
+      0x0  // PA_SC_LINE_STIPPLE
    };
    static const uint32_t VgtHosMaxTessLevelGfx9[] = {
-      0x0       , // VGT_HOS_MAX_TESS_LEVEL
-      0x0         // VGT_HOS_MIN_TESS_LEVEL
+      0x0, // VGT_HOS_MAX_TESS_LEVEL
+      0x0  // VGT_HOS_MIN_TESS_LEVEL
    };
    static const uint32_t VgtGsModeGfx9[] = {
-      0x0       , // VGT_GS_MODE
-      0x0       , // VGT_GS_ONCHIP_CNTL
-      0x0       , // PA_SC_MODE_CNTL_0
-      0x0       , // PA_SC_MODE_CNTL_1
-      0x0       , // VGT_ENHANCE
-      0x100     , // VGT_GS_PER_ES
-      0x80      , // VGT_ES_PER_GS
-      0x2       , // VGT_GS_PER_VS
-      0x0       , // VGT_GSVS_RING_OFFSET_1
-      0x0       , // VGT_GSVS_RING_OFFSET_2
-      0x0       , // VGT_GSVS_RING_OFFSET_3
-      0x0         // VGT_GS_OUT_PRIM_TYPE
+      0x0,   // VGT_GS_MODE
+      0x0,   // VGT_GS_ONCHIP_CNTL
+      0x0,   // PA_SC_MODE_CNTL_0
+      0x0,   // PA_SC_MODE_CNTL_1
+      0x0,   // VGT_ENHANCE
+      0x100, // VGT_GS_PER_ES
+      0x80,  // VGT_ES_PER_GS
+      0x2,   // VGT_GS_PER_VS
+      0x0,   // VGT_GSVS_RING_OFFSET_1
+      0x0,   // VGT_GSVS_RING_OFFSET_2
+      0x0,   // VGT_GSVS_RING_OFFSET_3
+      0x0    // VGT_GS_OUT_PRIM_TYPE
    };
    static const uint32_t VgtPrimitiveidEnGfx9[] = {
-      0x0         // VGT_PRIMITIVEID_EN
+      0x0 // VGT_PRIMITIVEID_EN
    };
    static const uint32_t VgtPrimitiveidResetGfx9[] = {
-      0x0         // VGT_PRIMITIVEID_RESET
+      0x0 // VGT_PRIMITIVEID_RESET
    };
    static const uint32_t VgtGsMaxPrimsPerSubgroupGfx9[] = {
-      0x0       , // VGT_GS_MAX_PRIMS_PER_SUBGROUP
-      0x0       , // VGT_DRAW_PAYLOAD_CNTL
-      0x0       , //
-      0x0       , // VGT_INSTANCE_STEP_RATE_0
-      0x0       , // VGT_INSTANCE_STEP_RATE_1
-      0x0       , //
-      0x0       , // VGT_ESGS_RING_ITEMSIZE
-      0x0       , // VGT_GSVS_RING_ITEMSIZE
-      0x0       , // VGT_REUSE_OFF
-      0x0       , // VGT_VTX_CNT_EN
-      0x0       , // DB_HTILE_SURFACE
-      0x0       , // DB_SRESULTS_COMPARE_STATE0
-      0x0       , // DB_SRESULTS_COMPARE_STATE1
-      0x0       , // DB_PRELOAD_CONTROL
-      0x0       , //
-      0x0       , // VGT_STRMOUT_BUFFER_SIZE_0
-      0x0         // VGT_STRMOUT_VTX_STRIDE_0
+      0x0, // VGT_GS_MAX_PRIMS_PER_SUBGROUP
+      0x0, // VGT_DRAW_PAYLOAD_CNTL
+      0x0, //
+      0x0, // VGT_INSTANCE_STEP_RATE_0
+      0x0, // VGT_INSTANCE_STEP_RATE_1
+      0x0, //
+      0x0, // VGT_ESGS_RING_ITEMSIZE
+      0x0, // VGT_GSVS_RING_ITEMSIZE
+      0x0, // VGT_REUSE_OFF
+      0x0, // VGT_VTX_CNT_EN
+      0x0, // DB_HTILE_SURFACE
+      0x0, // DB_SRESULTS_COMPARE_STATE0
+      0x0, // DB_SRESULTS_COMPARE_STATE1
+      0x0, // DB_PRELOAD_CONTROL
+      0x0, //
+      0x0, // VGT_STRMOUT_BUFFER_SIZE_0
+      0x0  // VGT_STRMOUT_VTX_STRIDE_0
    };
    static const uint32_t VgtStrmoutBufferSize1Gfx9[] = {
-      0x0       , // VGT_STRMOUT_BUFFER_SIZE_1
-      0x0         // VGT_STRMOUT_VTX_STRIDE_1
+      0x0, // VGT_STRMOUT_BUFFER_SIZE_1
+      0x0  // VGT_STRMOUT_VTX_STRIDE_1
    };
    static const uint32_t VgtStrmoutBufferSize2Gfx9[] = {
-      0x0       , // VGT_STRMOUT_BUFFER_SIZE_2
-      0x0         // VGT_STRMOUT_VTX_STRIDE_2
+      0x0, // VGT_STRMOUT_BUFFER_SIZE_2
+      0x0  // VGT_STRMOUT_VTX_STRIDE_2
    };
    static const uint32_t VgtStrmoutBufferSize3Gfx9[] = {
-      0x0       , // VGT_STRMOUT_BUFFER_SIZE_3
-      0x0         // VGT_STRMOUT_VTX_STRIDE_3
+      0x0, // VGT_STRMOUT_BUFFER_SIZE_3
+      0x0  // VGT_STRMOUT_VTX_STRIDE_3
    };
    static const uint32_t VgtStrmoutDrawOpaqueOffsetGfx9[] = {
-      0x0       , // VGT_STRMOUT_DRAW_OPAQUE_OFFSET
-      0x0       , // VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE
-      0x0         // VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE
+      0x0, // VGT_STRMOUT_DRAW_OPAQUE_OFFSET
+      0x0, // VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE
+      0x0  // VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE
    };
    static const uint32_t VgtGsMaxVertOutGfx9[] = {
-      0x0       , // VGT_GS_MAX_VERT_OUT
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , // VGT_TESS_DISTRIBUTION
-      0x0       , // VGT_SHADER_STAGES_EN
-      0x0       , // VGT_LS_HS_CONFIG
-      0x0       , // VGT_GS_VERT_ITEMSIZE
-      0x0       , // VGT_GS_VERT_ITEMSIZE_1
-      0x0       , // VGT_GS_VERT_ITEMSIZE_2
-      0x0       , // VGT_GS_VERT_ITEMSIZE_3
-      0x0       , // VGT_TF_PARAM
-      0x0       , // DB_ALPHA_TO_MASK
-      0x0       , // VGT_DISPATCH_DRAW_INDEX
-      0x0       , // PA_SU_POLY_OFFSET_DB_FMT_CNTL
-      0x0       , // PA_SU_POLY_OFFSET_CLAMP
-      0x0       , // PA_SU_POLY_OFFSET_FRONT_SCALE
-      0x0       , // PA_SU_POLY_OFFSET_FRONT_OFFSET
-      0x0       , // PA_SU_POLY_OFFSET_BACK_SCALE
-      0x0       , // PA_SU_POLY_OFFSET_BACK_OFFSET
-      0x0       , // VGT_GS_INSTANCE_CNT
-      0x0       , // VGT_STRMOUT_CONFIG
-      0x0         // VGT_STRMOUT_BUFFER_CONFIG
+      0x0, // VGT_GS_MAX_VERT_OUT
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, // VGT_TESS_DISTRIBUTION
+      0x0, // VGT_SHADER_STAGES_EN
+      0x0, // VGT_LS_HS_CONFIG
+      0x0, // VGT_GS_VERT_ITEMSIZE
+      0x0, // VGT_GS_VERT_ITEMSIZE_1
+      0x0, // VGT_GS_VERT_ITEMSIZE_2
+      0x0, // VGT_GS_VERT_ITEMSIZE_3
+      0x0, // VGT_TF_PARAM
+      0x0, // DB_ALPHA_TO_MASK
+      0x0, // VGT_DISPATCH_DRAW_INDEX
+      0x0, // PA_SU_POLY_OFFSET_DB_FMT_CNTL
+      0x0, // PA_SU_POLY_OFFSET_CLAMP
+      0x0, // PA_SU_POLY_OFFSET_FRONT_SCALE
+      0x0, // PA_SU_POLY_OFFSET_FRONT_OFFSET
+      0x0, // PA_SU_POLY_OFFSET_BACK_SCALE
+      0x0, // PA_SU_POLY_OFFSET_BACK_OFFSET
+      0x0, // VGT_GS_INSTANCE_CNT
+      0x0, // VGT_STRMOUT_CONFIG
+      0x0  // VGT_STRMOUT_BUFFER_CONFIG
    };
    static const uint32_t PaScCentroidPriority0Gfx9[] = {
-      0x0       , // PA_SC_CENTROID_PRIORITY_0
-      0x0       , // PA_SC_CENTROID_PRIORITY_1
-      0x1000    , // PA_SC_LINE_CNTL
-      0x0       , // PA_SC_AA_CONFIG
-      0x5       , // PA_SU_VTX_CNTL
+      0x0,        // PA_SC_CENTROID_PRIORITY_0
+      0x0,        // PA_SC_CENTROID_PRIORITY_1
+      0x1000,     // PA_SC_LINE_CNTL
+      0x0,        // PA_SC_AA_CONFIG
+      0x5,        // PA_SU_VTX_CNTL
       0x3f800000, // PA_CL_GB_VERT_CLIP_ADJ
       0x3f800000, // PA_CL_GB_VERT_DISC_ADJ
       0x3f800000, // PA_CL_GB_HORZ_CLIP_ADJ
       0x3f800000, // PA_CL_GB_HORZ_DISC_ADJ
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_2
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_3
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_2
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_3
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_2
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_3
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_2
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_3
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_2
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_3
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_2
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_3
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_2
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_3
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_2
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_3
       0xffffffff, // PA_SC_AA_MASK_X0Y0_X1Y0
       0xffffffff, // PA_SC_AA_MASK_X0Y1_X1Y1
-      0x0       , // PA_SC_SHADER_CONTROL
-      0x3       , // PA_SC_BINNER_CNTL_0
-      0x0       , // PA_SC_BINNER_CNTL_1
-      0x100000  , // PA_SC_CONSERVATIVE_RASTERIZATION_CNTL
-      0x0       , // PA_SC_NGG_MODE_CNTL
-      0x0       , //
-      0x1e      , // VGT_VERTEX_REUSE_BLOCK_CNTL
-      0x20      , // VGT_OUT_DEALLOC_CNTL
-      0x0       , // CB_COLOR0_BASE
-      0x0       , // CB_COLOR0_BASE_EXT
-      0x0       , // CB_COLOR0_ATTRIB2
-      0x0       , // CB_COLOR0_VIEW
-      0x0       , // CB_COLOR0_INFO
-      0x0       , // CB_COLOR0_ATTRIB
-      0x0       , // CB_COLOR0_DCC_CONTROL
-      0x0       , // CB_COLOR0_CMASK
-      0x0       , // CB_COLOR0_CMASK_BASE_EXT
-      0x0       , // CB_COLOR0_FMASK
-      0x0       , // CB_COLOR0_FMASK_BASE_EXT
-      0x0       , // CB_COLOR0_CLEAR_WORD0
-      0x0       , // CB_COLOR0_CLEAR_WORD1
-      0x0       , // CB_COLOR0_DCC_BASE
-      0x0       , // CB_COLOR0_DCC_BASE_EXT
-      0x0       , // CB_COLOR1_BASE
-      0x0       , // CB_COLOR1_BASE_EXT
-      0x0       , // CB_COLOR1_ATTRIB2
-      0x0       , // CB_COLOR1_VIEW
-      0x0       , // CB_COLOR1_INFO
-      0x0       , // CB_COLOR1_ATTRIB
-      0x0       , // CB_COLOR1_DCC_CONTROL
-      0x0       , // CB_COLOR1_CMASK
-      0x0       , // CB_COLOR1_CMASK_BASE_EXT
-      0x0       , // CB_COLOR1_FMASK
-      0x0       , // CB_COLOR1_FMASK_BASE_EXT
-      0x0       , // CB_COLOR1_CLEAR_WORD0
-      0x0       , // CB_COLOR1_CLEAR_WORD1
-      0x0       , // CB_COLOR1_DCC_BASE
-      0x0       , // CB_COLOR1_DCC_BASE_EXT
-      0x0       , // CB_COLOR2_BASE
-      0x0       , // CB_COLOR2_BASE_EXT
-      0x0       , // CB_COLOR2_ATTRIB2
-      0x0       , // CB_COLOR2_VIEW
-      0x0       , // CB_COLOR2_INFO
-      0x0       , // CB_COLOR2_ATTRIB
-      0x0       , // CB_COLOR2_DCC_CONTROL
-      0x0       , // CB_COLOR2_CMASK
-      0x0       , // CB_COLOR2_CMASK_BASE_EXT
-      0x0       , // CB_COLOR2_FMASK
-      0x0       , // CB_COLOR2_FMASK_BASE_EXT
-      0x0       , // CB_COLOR2_CLEAR_WORD0
-      0x0       , // CB_COLOR2_CLEAR_WORD1
-      0x0       , // CB_COLOR2_DCC_BASE
-      0x0       , // CB_COLOR2_DCC_BASE_EXT
-      0x0       , // CB_COLOR3_BASE
-      0x0       , // CB_COLOR3_BASE_EXT
-      0x0       , // CB_COLOR3_ATTRIB2
-      0x0       , // CB_COLOR3_VIEW
-      0x0       , // CB_COLOR3_INFO
-      0x0       , // CB_COLOR3_ATTRIB
-      0x0       , // CB_COLOR3_DCC_CONTROL
-      0x0       , // CB_COLOR3_CMASK
-      0x0       , // CB_COLOR3_CMASK_BASE_EXT
-      0x0       , // CB_COLOR3_FMASK
-      0x0       , // CB_COLOR3_FMASK_BASE_EXT
-      0x0       , // CB_COLOR3_CLEAR_WORD0
-      0x0       , // CB_COLOR3_CLEAR_WORD1
-      0x0       , // CB_COLOR3_DCC_BASE
-      0x0       , // CB_COLOR3_DCC_BASE_EXT
-      0x0       , // CB_COLOR4_BASE
-      0x0       , // CB_COLOR4_BASE_EXT
-      0x0       , // CB_COLOR4_ATTRIB2
-      0x0       , // CB_COLOR4_VIEW
-      0x0       , // CB_COLOR4_INFO
-      0x0       , // CB_COLOR4_ATTRIB
-      0x0       , // CB_COLOR4_DCC_CONTROL
-      0x0       , // CB_COLOR4_CMASK
-      0x0       , // CB_COLOR4_CMASK_BASE_EXT
-      0x0       , // CB_COLOR4_FMASK
-      0x0       , // CB_COLOR4_FMASK_BASE_EXT
-      0x0       , // CB_COLOR4_CLEAR_WORD0
-      0x0       , // CB_COLOR4_CLEAR_WORD1
-      0x0       , // CB_COLOR4_DCC_BASE
-      0x0       , // CB_COLOR4_DCC_BASE_EXT
-      0x0       , // CB_COLOR5_BASE
-      0x0       , // CB_COLOR5_BASE_EXT
-      0x0       , // CB_COLOR5_ATTRIB2
-      0x0       , // CB_COLOR5_VIEW
-      0x0       , // CB_COLOR5_INFO
-      0x0       , // CB_COLOR5_ATTRIB
-      0x0       , // CB_COLOR5_DCC_CONTROL
-      0x0       , // CB_COLOR5_CMASK
-      0x0       , // CB_COLOR5_CMASK_BASE_EXT
-      0x0       , // CB_COLOR5_FMASK
-      0x0       , // CB_COLOR5_FMASK_BASE_EXT
-      0x0       , // CB_COLOR5_CLEAR_WORD0
-      0x0       , // CB_COLOR5_CLEAR_WORD1
-      0x0       , // CB_COLOR5_DCC_BASE
-      0x0       , // CB_COLOR5_DCC_BASE_EXT
-      0x0       , // CB_COLOR6_BASE
-      0x0       , // CB_COLOR6_BASE_EXT
-      0x0       , // CB_COLOR6_ATTRIB2
-      0x0       , // CB_COLOR6_VIEW
-      0x0       , // CB_COLOR6_INFO
-      0x0       , // CB_COLOR6_ATTRIB
-      0x0       , // CB_COLOR6_DCC_CONTROL
-      0x0       , // CB_COLOR6_CMASK
-      0x0       , // CB_COLOR6_CMASK_BASE_EXT
-      0x0       , // CB_COLOR6_FMASK
-      0x0       , // CB_COLOR6_FMASK_BASE_EXT
-      0x0       , // CB_COLOR6_CLEAR_WORD0
-      0x0       , // CB_COLOR6_CLEAR_WORD1
-      0x0       , // CB_COLOR6_DCC_BASE
-      0x0       , // CB_COLOR6_DCC_BASE_EXT
-      0x0       , // CB_COLOR7_BASE
-      0x0       , // CB_COLOR7_BASE_EXT
-      0x0       , // CB_COLOR7_ATTRIB2
-      0x0       , // CB_COLOR7_VIEW
-      0x0       , // CB_COLOR7_INFO
-      0x0       , // CB_COLOR7_ATTRIB
-      0x0       , // CB_COLOR7_DCC_CONTROL
-      0x0       , // CB_COLOR7_CMASK
-      0x0       , // CB_COLOR7_CMASK_BASE_EXT
-      0x0       , // CB_COLOR7_FMASK
-      0x0       , // CB_COLOR7_FMASK_BASE_EXT
-      0x0       , // CB_COLOR7_CLEAR_WORD0
-      0x0       , // CB_COLOR7_CLEAR_WORD1
-      0x0       , // CB_COLOR7_DCC_BASE
+      0x0,        // PA_SC_SHADER_CONTROL
+      0x3,        // PA_SC_BINNER_CNTL_0
+      0x0,        // PA_SC_BINNER_CNTL_1
+      0x100000,   // PA_SC_CONSERVATIVE_RASTERIZATION_CNTL
+      0x0,        // PA_SC_NGG_MODE_CNTL
+      0x0,        //
+      0x1e,       // VGT_VERTEX_REUSE_BLOCK_CNTL
+      0x20,       // VGT_OUT_DEALLOC_CNTL
+      0x0,        // CB_COLOR0_BASE
+      0x0,        // CB_COLOR0_BASE_EXT
+      0x0,        // CB_COLOR0_ATTRIB2
+      0x0,        // CB_COLOR0_VIEW
+      0x0,        // CB_COLOR0_INFO
+      0x0,        // CB_COLOR0_ATTRIB
+      0x0,        // CB_COLOR0_DCC_CONTROL
+      0x0,        // CB_COLOR0_CMASK
+      0x0,        // CB_COLOR0_CMASK_BASE_EXT
+      0x0,        // CB_COLOR0_FMASK
+      0x0,        // CB_COLOR0_FMASK_BASE_EXT
+      0x0,        // CB_COLOR0_CLEAR_WORD0
+      0x0,        // CB_COLOR0_CLEAR_WORD1
+      0x0,        // CB_COLOR0_DCC_BASE
+      0x0,        // CB_COLOR0_DCC_BASE_EXT
+      0x0,        // CB_COLOR1_BASE
+      0x0,        // CB_COLOR1_BASE_EXT
+      0x0,        // CB_COLOR1_ATTRIB2
+      0x0,        // CB_COLOR1_VIEW
+      0x0,        // CB_COLOR1_INFO
+      0x0,        // CB_COLOR1_ATTRIB
+      0x0,        // CB_COLOR1_DCC_CONTROL
+      0x0,        // CB_COLOR1_CMASK
+      0x0,        // CB_COLOR1_CMASK_BASE_EXT
+      0x0,        // CB_COLOR1_FMASK
+      0x0,        // CB_COLOR1_FMASK_BASE_EXT
+      0x0,        // CB_COLOR1_CLEAR_WORD0
+      0x0,        // CB_COLOR1_CLEAR_WORD1
+      0x0,        // CB_COLOR1_DCC_BASE
+      0x0,        // CB_COLOR1_DCC_BASE_EXT
+      0x0,        // CB_COLOR2_BASE
+      0x0,        // CB_COLOR2_BASE_EXT
+      0x0,        // CB_COLOR2_ATTRIB2
+      0x0,        // CB_COLOR2_VIEW
+      0x0,        // CB_COLOR2_INFO
+      0x0,        // CB_COLOR2_ATTRIB
+      0x0,        // CB_COLOR2_DCC_CONTROL
+      0x0,        // CB_COLOR2_CMASK
+      0x0,        // CB_COLOR2_CMASK_BASE_EXT
+      0x0,        // CB_COLOR2_FMASK
+      0x0,        // CB_COLOR2_FMASK_BASE_EXT
+      0x0,        // CB_COLOR2_CLEAR_WORD0
+      0x0,        // CB_COLOR2_CLEAR_WORD1
+      0x0,        // CB_COLOR2_DCC_BASE
+      0x0,        // CB_COLOR2_DCC_BASE_EXT
+      0x0,        // CB_COLOR3_BASE
+      0x0,        // CB_COLOR3_BASE_EXT
+      0x0,        // CB_COLOR3_ATTRIB2
+      0x0,        // CB_COLOR3_VIEW
+      0x0,        // CB_COLOR3_INFO
+      0x0,        // CB_COLOR3_ATTRIB
+      0x0,        // CB_COLOR3_DCC_CONTROL
+      0x0,        // CB_COLOR3_CMASK
+      0x0,        // CB_COLOR3_CMASK_BASE_EXT
+      0x0,        // CB_COLOR3_FMASK
+      0x0,        // CB_COLOR3_FMASK_BASE_EXT
+      0x0,        // CB_COLOR3_CLEAR_WORD0
+      0x0,        // CB_COLOR3_CLEAR_WORD1
+      0x0,        // CB_COLOR3_DCC_BASE
+      0x0,        // CB_COLOR3_DCC_BASE_EXT
+      0x0,        // CB_COLOR4_BASE
+      0x0,        // CB_COLOR4_BASE_EXT
+      0x0,        // CB_COLOR4_ATTRIB2
+      0x0,        // CB_COLOR4_VIEW
+      0x0,        // CB_COLOR4_INFO
+      0x0,        // CB_COLOR4_ATTRIB
+      0x0,        // CB_COLOR4_DCC_CONTROL
+      0x0,        // CB_COLOR4_CMASK
+      0x0,        // CB_COLOR4_CMASK_BASE_EXT
+      0x0,        // CB_COLOR4_FMASK
+      0x0,        // CB_COLOR4_FMASK_BASE_EXT
+      0x0,        // CB_COLOR4_CLEAR_WORD0
+      0x0,        // CB_COLOR4_CLEAR_WORD1
+      0x0,        // CB_COLOR4_DCC_BASE
+      0x0,        // CB_COLOR4_DCC_BASE_EXT
+      0x0,        // CB_COLOR5_BASE
+      0x0,        // CB_COLOR5_BASE_EXT
+      0x0,        // CB_COLOR5_ATTRIB2
+      0x0,        // CB_COLOR5_VIEW
+      0x0,        // CB_COLOR5_INFO
+      0x0,        // CB_COLOR5_ATTRIB
+      0x0,        // CB_COLOR5_DCC_CONTROL
+      0x0,        // CB_COLOR5_CMASK
+      0x0,        // CB_COLOR5_CMASK_BASE_EXT
+      0x0,        // CB_COLOR5_FMASK
+      0x0,        // CB_COLOR5_FMASK_BASE_EXT
+      0x0,        // CB_COLOR5_CLEAR_WORD0
+      0x0,        // CB_COLOR5_CLEAR_WORD1
+      0x0,        // CB_COLOR5_DCC_BASE
+      0x0,        // CB_COLOR5_DCC_BASE_EXT
+      0x0,        // CB_COLOR6_BASE
+      0x0,        // CB_COLOR6_BASE_EXT
+      0x0,        // CB_COLOR6_ATTRIB2
+      0x0,        // CB_COLOR6_VIEW
+      0x0,        // CB_COLOR6_INFO
+      0x0,        // CB_COLOR6_ATTRIB
+      0x0,        // CB_COLOR6_DCC_CONTROL
+      0x0,        // CB_COLOR6_CMASK
+      0x0,        // CB_COLOR6_CMASK_BASE_EXT
+      0x0,        // CB_COLOR6_FMASK
+      0x0,        // CB_COLOR6_FMASK_BASE_EXT
+      0x0,        // CB_COLOR6_CLEAR_WORD0
+      0x0,        // CB_COLOR6_CLEAR_WORD1
+      0x0,        // CB_COLOR6_DCC_BASE
+      0x0,        // CB_COLOR6_DCC_BASE_EXT
+      0x0,        // CB_COLOR7_BASE
+      0x0,        // CB_COLOR7_BASE_EXT
+      0x0,        // CB_COLOR7_ATTRIB2
+      0x0,        // CB_COLOR7_VIEW
+      0x0,        // CB_COLOR7_INFO
+      0x0,        // CB_COLOR7_ATTRIB
+      0x0,        // CB_COLOR7_DCC_CONTROL
+      0x0,        // CB_COLOR7_CMASK
+      0x0,        // CB_COLOR7_CMASK_BASE_EXT
+      0x0,        // CB_COLOR7_FMASK
+      0x0,        // CB_COLOR7_FMASK_BASE_EXT
+      0x0,        // CB_COLOR7_CLEAR_WORD0
+      0x0,        // CB_COLOR7_CLEAR_WORD1
+      0x0,        // CB_COLOR7_DCC_BASE
       0x0         // CB_COLOR7_DCC_BASE_EXT
    };
 
@@ -1500,7 +1506,8 @@ static void gfx9_emulate_clear_state(struct radeon_cmdbuf *cs,
 
    set_context_reg_seq_array(cs, R_028000_DB_RENDER_CONTROL, SET(DbRenderControlGfx9));
    set_context_reg_seq_array(cs, R_0281E8_COHER_DEST_BASE_HI_0, SET(CoherDestBaseHi0Gfx9));
-   set_context_reg_seq_array(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, SET(VgtMultiPrimIbResetIndxGfx9));
+   set_context_reg_seq_array(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
+                             SET(VgtMultiPrimIbResetIndxGfx9));
    set_context_reg_seq_array(cs, R_028414_CB_BLEND_RED, SET(CbBlendRedGfx9));
    set_context_reg_seq_array(cs, R_028644_SPI_PS_INPUT_CNTL_0, SET(SpiPsInputCntl0Gfx9));
    set_context_reg_seq_array(cs, R_028754_SX_PS_DOWNCONVERT, SET(SxPsDownconvertGfx9));
@@ -1511,13 +1518,19 @@ static void gfx9_emulate_clear_state(struct radeon_cmdbuf *cs,
    set_context_reg_seq_array(cs, R_028A40_VGT_GS_MODE, SET(VgtGsModeGfx9));
    set_context_reg_seq_array(cs, R_028A84_VGT_PRIMITIVEID_EN, SET(VgtPrimitiveidEnGfx9));
    set_context_reg_seq_array(cs, R_028A8C_VGT_PRIMITIVEID_RESET, SET(VgtPrimitiveidResetGfx9));
-   set_context_reg_seq_array(cs, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP, SET(VgtGsMaxPrimsPerSubgroupGfx9));
-   set_context_reg_seq_array(cs, R_028AE0_VGT_STRMOUT_BUFFER_SIZE_1, SET(VgtStrmoutBufferSize1Gfx9));
-   set_context_reg_seq_array(cs, R_028AF0_VGT_STRMOUT_BUFFER_SIZE_2, SET(VgtStrmoutBufferSize2Gfx9));
-   set_context_reg_seq_array(cs, R_028B00_VGT_STRMOUT_BUFFER_SIZE_3, SET(VgtStrmoutBufferSize3Gfx9));
-   set_context_reg_seq_array(cs, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, SET(VgtStrmoutDrawOpaqueOffsetGfx9));
+   set_context_reg_seq_array(cs, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
+                             SET(VgtGsMaxPrimsPerSubgroupGfx9));
+   set_context_reg_seq_array(cs, R_028AE0_VGT_STRMOUT_BUFFER_SIZE_1,
+                             SET(VgtStrmoutBufferSize1Gfx9));
+   set_context_reg_seq_array(cs, R_028AF0_VGT_STRMOUT_BUFFER_SIZE_2,
+                             SET(VgtStrmoutBufferSize2Gfx9));
+   set_context_reg_seq_array(cs, R_028B00_VGT_STRMOUT_BUFFER_SIZE_3,
+                             SET(VgtStrmoutBufferSize3Gfx9));
+   set_context_reg_seq_array(cs, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET,
+                             SET(VgtStrmoutDrawOpaqueOffsetGfx9));
    set_context_reg_seq_array(cs, R_028B38_VGT_GS_MAX_VERT_OUT, SET(VgtGsMaxVertOutGfx9));
-   set_context_reg_seq_array(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, SET(PaScCentroidPriority0Gfx9));
+   set_context_reg_seq_array(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0,
+                             SET(PaScCentroidPriority0Gfx9));
 }
 
 /**
@@ -1529,68 +1542,68 @@ static void gfx10_emulate_clear_state(struct radeon_cmdbuf *cs, unsigned num_reg
                                       set_context_reg_seq_array_fn set_context_reg_seq_array)
 {
    static const uint32_t DbRenderControlNv10[] = {
-      0x0       , // DB_RENDER_CONTROL
-      0x0       , // DB_COUNT_CONTROL
-      0x0       , // DB_DEPTH_VIEW
-      0x0       , // DB_RENDER_OVERRIDE
-      0x0       , // DB_RENDER_OVERRIDE2
-      0x0       , // DB_HTILE_DATA_BASE
-      0x0       , //
-      0x0       , // DB_DEPTH_SIZE_XY
-      0x0       , // DB_DEPTH_BOUNDS_MIN
-      0x0       , // DB_DEPTH_BOUNDS_MAX
-      0x0       , // DB_STENCIL_CLEAR
-      0x0       , // DB_DEPTH_CLEAR
-      0x0       , // PA_SC_SCREEN_SCISSOR_TL
+      0x0,        // DB_RENDER_CONTROL
+      0x0,        // DB_COUNT_CONTROL
+      0x0,        // DB_DEPTH_VIEW
+      0x0,        // DB_RENDER_OVERRIDE
+      0x0,        // DB_RENDER_OVERRIDE2
+      0x0,        // DB_HTILE_DATA_BASE
+      0x0,        //
+      0x0,        // DB_DEPTH_SIZE_XY
+      0x0,        // DB_DEPTH_BOUNDS_MIN
+      0x0,        // DB_DEPTH_BOUNDS_MAX
+      0x0,        // DB_STENCIL_CLEAR
+      0x0,        // DB_DEPTH_CLEAR
+      0x0,        // PA_SC_SCREEN_SCISSOR_TL
       0x40004000, // PA_SC_SCREEN_SCISSOR_BR
-      0x0       , // DB_DFSM_CONTROL
-      0x0       , // DB_RESERVED_REG_2
-      0x0       , // DB_Z_INFO
-      0x0       , // DB_STENCIL_INFO
-      0x0       , // DB_Z_READ_BASE
-      0x0       , // DB_STENCIL_READ_BASE
-      0x0       , // DB_Z_WRITE_BASE
-      0x0       , // DB_STENCIL_WRITE_BASE
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , // DB_Z_READ_BASE_HI
-      0x0       , // DB_STENCIL_READ_BASE_HI
-      0x0       , // DB_Z_WRITE_BASE_HI
-      0x0       , // DB_STENCIL_WRITE_BASE_HI
-      0x0       , // DB_HTILE_DATA_BASE_HI
-      0x0       , // DB_RMI_L2_CACHE_CONTROL
-      0x0       , // TA_BC_BASE_ADDR
+      0x0,        // DB_DFSM_CONTROL
+      0x0,        // DB_RESERVED_REG_2
+      0x0,        // DB_Z_INFO
+      0x0,        // DB_STENCIL_INFO
+      0x0,        // DB_Z_READ_BASE
+      0x0,        // DB_STENCIL_READ_BASE
+      0x0,        // DB_Z_WRITE_BASE
+      0x0,        // DB_STENCIL_WRITE_BASE
+      0x0,        //
+      0x0,        //
+      0x0,        //
+      0x0,        //
+      0x0,        // DB_Z_READ_BASE_HI
+      0x0,        // DB_STENCIL_READ_BASE_HI
+      0x0,        // DB_Z_WRITE_BASE_HI
+      0x0,        // DB_STENCIL_WRITE_BASE_HI
+      0x0,        // DB_HTILE_DATA_BASE_HI
+      0x0,        // DB_RMI_L2_CACHE_CONTROL
+      0x0,        // TA_BC_BASE_ADDR
       0x0         // TA_BC_BASE_ADDR_HI
    };
    static const uint32_t CoherDestBaseHi0Nv10[] = {
-      0x0       , // COHER_DEST_BASE_HI_0
-      0x0       , // COHER_DEST_BASE_HI_1
-      0x0       , // COHER_DEST_BASE_HI_2
-      0x0       , // COHER_DEST_BASE_HI_3
-      0x0       , // COHER_DEST_BASE_2
-      0x0       , // COHER_DEST_BASE_3
-      0x0       , // PA_SC_WINDOW_OFFSET
+      0x0,        // COHER_DEST_BASE_HI_0
+      0x0,        // COHER_DEST_BASE_HI_1
+      0x0,        // COHER_DEST_BASE_HI_2
+      0x0,        // COHER_DEST_BASE_HI_3
+      0x0,        // COHER_DEST_BASE_2
+      0x0,        // COHER_DEST_BASE_3
+      0x0,        // PA_SC_WINDOW_OFFSET
       0x80000000, // PA_SC_WINDOW_SCISSOR_TL
       0x40004000, // PA_SC_WINDOW_SCISSOR_BR
-      0xffff    , // PA_SC_CLIPRECT_RULE
-      0x0       , // PA_SC_CLIPRECT_0_TL
+      0xffff,     // PA_SC_CLIPRECT_RULE
+      0x0,        // PA_SC_CLIPRECT_0_TL
       0x40004000, // PA_SC_CLIPRECT_0_BR
-      0x0       , // PA_SC_CLIPRECT_1_TL
+      0x0,        // PA_SC_CLIPRECT_1_TL
       0x40004000, // PA_SC_CLIPRECT_1_BR
-      0x0       , // PA_SC_CLIPRECT_2_TL
+      0x0,        // PA_SC_CLIPRECT_2_TL
       0x40004000, // PA_SC_CLIPRECT_2_BR
-      0x0       , // PA_SC_CLIPRECT_3_TL
+      0x0,        // PA_SC_CLIPRECT_3_TL
       0x40004000, // PA_SC_CLIPRECT_3_BR
       0xaa99aaaa, // PA_SC_EDGERULE
-      0x0       , // PA_SU_HARDWARE_SCREEN_OFFSET
+      0x0,        // PA_SU_HARDWARE_SCREEN_OFFSET
       0xffffffff, // CB_TARGET_MASK
       0xffffffff, // CB_SHADER_MASK
       0x80000000, // PA_SC_GENERIC_SCISSOR_TL
       0x40004000, // PA_SC_GENERIC_SCISSOR_BR
-      0x0       , // COHER_DEST_BASE_0
-      0x0       , // COHER_DEST_BASE_1
+      0x0,        // COHER_DEST_BASE_0
+      0x0,        // COHER_DEST_BASE_1
       0x80000000, // PA_SC_VPORT_SCISSOR_0_TL
       0x40004000, // PA_SC_VPORT_SCISSOR_0_BR
       0x80000000, // PA_SC_VPORT_SCISSOR_1_TL
@@ -1623,583 +1636,585 @@ static void gfx10_emulate_clear_state(struct radeon_cmdbuf *cs, unsigned num_reg
       0x40004000, // PA_SC_VPORT_SCISSOR_14_BR
       0x80000000, // PA_SC_VPORT_SCISSOR_15_TL
       0x40004000, // PA_SC_VPORT_SCISSOR_15_BR
-      0x0       , // PA_SC_VPORT_ZMIN_0
+      0x0,        // PA_SC_VPORT_ZMIN_0
       0x3f800000, // PA_SC_VPORT_ZMAX_0
-      0x0       , // PA_SC_VPORT_ZMIN_1
+      0x0,        // PA_SC_VPORT_ZMIN_1
       0x3f800000, // PA_SC_VPORT_ZMAX_1
-      0x0       , // PA_SC_VPORT_ZMIN_2
+      0x0,        // PA_SC_VPORT_ZMIN_2
       0x3f800000, // PA_SC_VPORT_ZMAX_2
-      0x0       , // PA_SC_VPORT_ZMIN_3
+      0x0,        // PA_SC_VPORT_ZMIN_3
       0x3f800000, // PA_SC_VPORT_ZMAX_3
-      0x0       , // PA_SC_VPORT_ZMIN_4
+      0x0,        // PA_SC_VPORT_ZMIN_4
       0x3f800000, // PA_SC_VPORT_ZMAX_4
-      0x0       , // PA_SC_VPORT_ZMIN_5
+      0x0,        // PA_SC_VPORT_ZMIN_5
       0x3f800000, // PA_SC_VPORT_ZMAX_5
-      0x0       , // PA_SC_VPORT_ZMIN_6
+      0x0,        // PA_SC_VPORT_ZMIN_6
       0x3f800000, // PA_SC_VPORT_ZMAX_6
-      0x0       , // PA_SC_VPORT_ZMIN_7
+      0x0,        // PA_SC_VPORT_ZMIN_7
       0x3f800000, // PA_SC_VPORT_ZMAX_7
-      0x0       , // PA_SC_VPORT_ZMIN_8
+      0x0,        // PA_SC_VPORT_ZMIN_8
       0x3f800000, // PA_SC_VPORT_ZMAX_8
-      0x0       , // PA_SC_VPORT_ZMIN_9
+      0x0,        // PA_SC_VPORT_ZMIN_9
       0x3f800000, // PA_SC_VPORT_ZMAX_9
-      0x0       , // PA_SC_VPORT_ZMIN_10
+      0x0,        // PA_SC_VPORT_ZMIN_10
       0x3f800000, // PA_SC_VPORT_ZMAX_10
-      0x0       , // PA_SC_VPORT_ZMIN_11
+      0x0,        // PA_SC_VPORT_ZMIN_11
       0x3f800000, // PA_SC_VPORT_ZMAX_11
-      0x0       , // PA_SC_VPORT_ZMIN_12
+      0x0,        // PA_SC_VPORT_ZMIN_12
       0x3f800000, // PA_SC_VPORT_ZMAX_12
-      0x0       , // PA_SC_VPORT_ZMIN_13
+      0x0,        // PA_SC_VPORT_ZMIN_13
       0x3f800000, // PA_SC_VPORT_ZMAX_13
-      0x0       , // PA_SC_VPORT_ZMIN_14
+      0x0,        // PA_SC_VPORT_ZMIN_14
       0x3f800000, // PA_SC_VPORT_ZMAX_14
-      0x0       , // PA_SC_VPORT_ZMIN_15
+      0x0,        // PA_SC_VPORT_ZMIN_15
       0x3f800000, // PA_SC_VPORT_ZMAX_15
-      0x0       , // PA_SC_RASTER_CONFIG
-      0x0       , // PA_SC_RASTER_CONFIG_1
-      0x0       , //
+      0x0,        // PA_SC_RASTER_CONFIG
+      0x0,        // PA_SC_RASTER_CONFIG_1
+      0x0,        //
       0x0         // PA_SC_TILE_STEERING_OVERRIDE
    };
    static const uint32_t VgtMultiPrimIbResetIndxNv10[] = {
-      0x0       , // VGT_MULTI_PRIM_IB_RESET_INDX
-      0x0       , // CB_RMI_GL2_CACHE_CONTROL
-      0x0       , // CB_BLEND_RED
-      0x0       , // CB_BLEND_GREEN
-      0x0       , // CB_BLEND_BLUE
-      0x0       , // CB_BLEND_ALPHA
-      0x0       , // CB_DCC_CONTROL
-      0x0       , // CB_COVERAGE_OUT_CONTROL
-      0x0       , // DB_STENCIL_CONTROL
-      0x1000000 , // DB_STENCILREFMASK
-      0x1000000 , // DB_STENCILREFMASK_BF
-      0x0       , //
-      0x0       , // PA_CL_VPORT_XSCALE
-      0x0       , // PA_CL_VPORT_XOFFSET
-      0x0       , // PA_CL_VPORT_YSCALE
-      0x0       , // PA_CL_VPORT_YOFFSET
-      0x0       , // PA_CL_VPORT_ZSCALE
-      0x0       , // PA_CL_VPORT_ZOFFSET
-      0x0       , // PA_CL_VPORT_XSCALE_1
-      0x0       , // PA_CL_VPORT_XOFFSET_1
-      0x0       , // PA_CL_VPORT_YSCALE_1
-      0x0       , // PA_CL_VPORT_YOFFSET_1
-      0x0       , // PA_CL_VPORT_ZSCALE_1
-      0x0       , // PA_CL_VPORT_ZOFFSET_1
-      0x0       , // PA_CL_VPORT_XSCALE_2
-      0x0       , // PA_CL_VPORT_XOFFSET_2
-      0x0       , // PA_CL_VPORT_YSCALE_2
-      0x0       , // PA_CL_VPORT_YOFFSET_2
-      0x0       , // PA_CL_VPORT_ZSCALE_2
-      0x0       , // PA_CL_VPORT_ZOFFSET_2
-      0x0       , // PA_CL_VPORT_XSCALE_3
-      0x0       , // PA_CL_VPORT_XOFFSET_3
-      0x0       , // PA_CL_VPORT_YSCALE_3
-      0x0       , // PA_CL_VPORT_YOFFSET_3
-      0x0       , // PA_CL_VPORT_ZSCALE_3
-      0x0       , // PA_CL_VPORT_ZOFFSET_3
-      0x0       , // PA_CL_VPORT_XSCALE_4
-      0x0       , // PA_CL_VPORT_XOFFSET_4
-      0x0       , // PA_CL_VPORT_YSCALE_4
-      0x0       , // PA_CL_VPORT_YOFFSET_4
-      0x0       , // PA_CL_VPORT_ZSCALE_4
-      0x0       , // PA_CL_VPORT_ZOFFSET_4
-      0x0       , // PA_CL_VPORT_XSCALE_5
-      0x0       , // PA_CL_VPORT_XOFFSET_5
-      0x0       , // PA_CL_VPORT_YSCALE_5
-      0x0       , // PA_CL_VPORT_YOFFSET_5
-      0x0       , // PA_CL_VPORT_ZSCALE_5
-      0x0       , // PA_CL_VPORT_ZOFFSET_5
-      0x0       , // PA_CL_VPORT_XSCALE_6
-      0x0       , // PA_CL_VPORT_XOFFSET_6
-      0x0       , // PA_CL_VPORT_YSCALE_6
-      0x0       , // PA_CL_VPORT_YOFFSET_6
-      0x0       , // PA_CL_VPORT_ZSCALE_6
-      0x0       , // PA_CL_VPORT_ZOFFSET_6
-      0x0       , // PA_CL_VPORT_XSCALE_7
-      0x0       , // PA_CL_VPORT_XOFFSET_7
-      0x0       , // PA_CL_VPORT_YSCALE_7
-      0x0       , // PA_CL_VPORT_YOFFSET_7
-      0x0       , // PA_CL_VPORT_ZSCALE_7
-      0x0       , // PA_CL_VPORT_ZOFFSET_7
-      0x0       , // PA_CL_VPORT_XSCALE_8
-      0x0       , // PA_CL_VPORT_XOFFSET_8
-      0x0       , // PA_CL_VPORT_YSCALE_8
-      0x0       , // PA_CL_VPORT_YOFFSET_8
-      0x0       , // PA_CL_VPORT_ZSCALE_8
-      0x0       , // PA_CL_VPORT_ZOFFSET_8
-      0x0       , // PA_CL_VPORT_XSCALE_9
-      0x0       , // PA_CL_VPORT_XOFFSET_9
-      0x0       , // PA_CL_VPORT_YSCALE_9
-      0x0       , // PA_CL_VPORT_YOFFSET_9
-      0x0       , // PA_CL_VPORT_ZSCALE_9
-      0x0       , // PA_CL_VPORT_ZOFFSET_9
-      0x0       , // PA_CL_VPORT_XSCALE_10
-      0x0       , // PA_CL_VPORT_XOFFSET_10
-      0x0       , // PA_CL_VPORT_YSCALE_10
-      0x0       , // PA_CL_VPORT_YOFFSET_10
-      0x0       , // PA_CL_VPORT_ZSCALE_10
-      0x0       , // PA_CL_VPORT_ZOFFSET_10
-      0x0       , // PA_CL_VPORT_XSCALE_11
-      0x0       , // PA_CL_VPORT_XOFFSET_11
-      0x0       , // PA_CL_VPORT_YSCALE_11
-      0x0       , // PA_CL_VPORT_YOFFSET_11
-      0x0       , // PA_CL_VPORT_ZSCALE_11
-      0x0       , // PA_CL_VPORT_ZOFFSET_11
-      0x0       , // PA_CL_VPORT_XSCALE_12
-      0x0       , // PA_CL_VPORT_XOFFSET_12
-      0x0       , // PA_CL_VPORT_YSCALE_12
-      0x0       , // PA_CL_VPORT_YOFFSET_12
-      0x0       , // PA_CL_VPORT_ZSCALE_12
-      0x0       , // PA_CL_VPORT_ZOFFSET_12
-      0x0       , // PA_CL_VPORT_XSCALE_13
-      0x0       , // PA_CL_VPORT_XOFFSET_13
-      0x0       , // PA_CL_VPORT_YSCALE_13
-      0x0       , // PA_CL_VPORT_YOFFSET_13
-      0x0       , // PA_CL_VPORT_ZSCALE_13
-      0x0       , // PA_CL_VPORT_ZOFFSET_13
-      0x0       , // PA_CL_VPORT_XSCALE_14
-      0x0       , // PA_CL_VPORT_XOFFSET_14
-      0x0       , // PA_CL_VPORT_YSCALE_14
-      0x0       , // PA_CL_VPORT_YOFFSET_14
-      0x0       , // PA_CL_VPORT_ZSCALE_14
-      0x0       , // PA_CL_VPORT_ZOFFSET_14
-      0x0       , // PA_CL_VPORT_XSCALE_15
-      0x0       , // PA_CL_VPORT_XOFFSET_15
-      0x0       , // PA_CL_VPORT_YSCALE_15
-      0x0       , // PA_CL_VPORT_YOFFSET_15
-      0x0       , // PA_CL_VPORT_ZSCALE_15
-      0x0       , // PA_CL_VPORT_ZOFFSET_15
-      0x0       , // PA_CL_UCP_0_X
-      0x0       , // PA_CL_UCP_0_Y
-      0x0       , // PA_CL_UCP_0_Z
-      0x0       , // PA_CL_UCP_0_W
-      0x0       , // PA_CL_UCP_1_X
-      0x0       , // PA_CL_UCP_1_Y
-      0x0       , // PA_CL_UCP_1_Z
-      0x0       , // PA_CL_UCP_1_W
-      0x0       , // PA_CL_UCP_2_X
-      0x0       , // PA_CL_UCP_2_Y
-      0x0       , // PA_CL_UCP_2_Z
-      0x0       , // PA_CL_UCP_2_W
-      0x0       , // PA_CL_UCP_3_X
-      0x0       , // PA_CL_UCP_3_Y
-      0x0       , // PA_CL_UCP_3_Z
-      0x0       , // PA_CL_UCP_3_W
-      0x0       , // PA_CL_UCP_4_X
-      0x0       , // PA_CL_UCP_4_Y
-      0x0       , // PA_CL_UCP_4_Z
-      0x0       , // PA_CL_UCP_4_W
-      0x0       , // PA_CL_UCP_5_X
-      0x0       , // PA_CL_UCP_5_Y
-      0x0       , // PA_CL_UCP_5_Z
-      0x0         // PA_CL_UCP_5_W
+      0x0,       // VGT_MULTI_PRIM_IB_RESET_INDX
+      0x0,       // CB_RMI_GL2_CACHE_CONTROL
+      0x0,       // CB_BLEND_RED
+      0x0,       // CB_BLEND_GREEN
+      0x0,       // CB_BLEND_BLUE
+      0x0,       // CB_BLEND_ALPHA
+      0x0,       // CB_DCC_CONTROL
+      0x0,       // CB_COVERAGE_OUT_CONTROL
+      0x0,       // DB_STENCIL_CONTROL
+      0x1000000, // DB_STENCILREFMASK
+      0x1000000, // DB_STENCILREFMASK_BF
+      0x0,       //
+      0x0,       // PA_CL_VPORT_XSCALE
+      0x0,       // PA_CL_VPORT_XOFFSET
+      0x0,       // PA_CL_VPORT_YSCALE
+      0x0,       // PA_CL_VPORT_YOFFSET
+      0x0,       // PA_CL_VPORT_ZSCALE
+      0x0,       // PA_CL_VPORT_ZOFFSET
+      0x0,       // PA_CL_VPORT_XSCALE_1
+      0x0,       // PA_CL_VPORT_XOFFSET_1
+      0x0,       // PA_CL_VPORT_YSCALE_1
+      0x0,       // PA_CL_VPORT_YOFFSET_1
+      0x0,       // PA_CL_VPORT_ZSCALE_1
+      0x0,       // PA_CL_VPORT_ZOFFSET_1
+      0x0,       // PA_CL_VPORT_XSCALE_2
+      0x0,       // PA_CL_VPORT_XOFFSET_2
+      0x0,       // PA_CL_VPORT_YSCALE_2
+      0x0,       // PA_CL_VPORT_YOFFSET_2
+      0x0,       // PA_CL_VPORT_ZSCALE_2
+      0x0,       // PA_CL_VPORT_ZOFFSET_2
+      0x0,       // PA_CL_VPORT_XSCALE_3
+      0x0,       // PA_CL_VPORT_XOFFSET_3
+      0x0,       // PA_CL_VPORT_YSCALE_3
+      0x0,       // PA_CL_VPORT_YOFFSET_3
+      0x0,       // PA_CL_VPORT_ZSCALE_3
+      0x0,       // PA_CL_VPORT_ZOFFSET_3
+      0x0,       // PA_CL_VPORT_XSCALE_4
+      0x0,       // PA_CL_VPORT_XOFFSET_4
+      0x0,       // PA_CL_VPORT_YSCALE_4
+      0x0,       // PA_CL_VPORT_YOFFSET_4
+      0x0,       // PA_CL_VPORT_ZSCALE_4
+      0x0,       // PA_CL_VPORT_ZOFFSET_4
+      0x0,       // PA_CL_VPORT_XSCALE_5
+      0x0,       // PA_CL_VPORT_XOFFSET_5
+      0x0,       // PA_CL_VPORT_YSCALE_5
+      0x0,       // PA_CL_VPORT_YOFFSET_5
+      0x0,       // PA_CL_VPORT_ZSCALE_5
+      0x0,       // PA_CL_VPORT_ZOFFSET_5
+      0x0,       // PA_CL_VPORT_XSCALE_6
+      0x0,       // PA_CL_VPORT_XOFFSET_6
+      0x0,       // PA_CL_VPORT_YSCALE_6
+      0x0,       // PA_CL_VPORT_YOFFSET_6
+      0x0,       // PA_CL_VPORT_ZSCALE_6
+      0x0,       // PA_CL_VPORT_ZOFFSET_6
+      0x0,       // PA_CL_VPORT_XSCALE_7
+      0x0,       // PA_CL_VPORT_XOFFSET_7
+      0x0,       // PA_CL_VPORT_YSCALE_7
+      0x0,       // PA_CL_VPORT_YOFFSET_7
+      0x0,       // PA_CL_VPORT_ZSCALE_7
+      0x0,       // PA_CL_VPORT_ZOFFSET_7
+      0x0,       // PA_CL_VPORT_XSCALE_8
+      0x0,       // PA_CL_VPORT_XOFFSET_8
+      0x0,       // PA_CL_VPORT_YSCALE_8
+      0x0,       // PA_CL_VPORT_YOFFSET_8
+      0x0,       // PA_CL_VPORT_ZSCALE_8
+      0x0,       // PA_CL_VPORT_ZOFFSET_8
+      0x0,       // PA_CL_VPORT_XSCALE_9
+      0x0,       // PA_CL_VPORT_XOFFSET_9
+      0x0,       // PA_CL_VPORT_YSCALE_9
+      0x0,       // PA_CL_VPORT_YOFFSET_9
+      0x0,       // PA_CL_VPORT_ZSCALE_9
+      0x0,       // PA_CL_VPORT_ZOFFSET_9
+      0x0,       // PA_CL_VPORT_XSCALE_10
+      0x0,       // PA_CL_VPORT_XOFFSET_10
+      0x0,       // PA_CL_VPORT_YSCALE_10
+      0x0,       // PA_CL_VPORT_YOFFSET_10
+      0x0,       // PA_CL_VPORT_ZSCALE_10
+      0x0,       // PA_CL_VPORT_ZOFFSET_10
+      0x0,       // PA_CL_VPORT_XSCALE_11
+      0x0,       // PA_CL_VPORT_XOFFSET_11
+      0x0,       // PA_CL_VPORT_YSCALE_11
+      0x0,       // PA_CL_VPORT_YOFFSET_11
+      0x0,       // PA_CL_VPORT_ZSCALE_11
+      0x0,       // PA_CL_VPORT_ZOFFSET_11
+      0x0,       // PA_CL_VPORT_XSCALE_12
+      0x0,       // PA_CL_VPORT_XOFFSET_12
+      0x0,       // PA_CL_VPORT_YSCALE_12
+      0x0,       // PA_CL_VPORT_YOFFSET_12
+      0x0,       // PA_CL_VPORT_ZSCALE_12
+      0x0,       // PA_CL_VPORT_ZOFFSET_12
+      0x0,       // PA_CL_VPORT_XSCALE_13
+      0x0,       // PA_CL_VPORT_XOFFSET_13
+      0x0,       // PA_CL_VPORT_YSCALE_13
+      0x0,       // PA_CL_VPORT_YOFFSET_13
+      0x0,       // PA_CL_VPORT_ZSCALE_13
+      0x0,       // PA_CL_VPORT_ZOFFSET_13
+      0x0,       // PA_CL_VPORT_XSCALE_14
+      0x0,       // PA_CL_VPORT_XOFFSET_14
+      0x0,       // PA_CL_VPORT_YSCALE_14
+      0x0,       // PA_CL_VPORT_YOFFSET_14
+      0x0,       // PA_CL_VPORT_ZSCALE_14
+      0x0,       // PA_CL_VPORT_ZOFFSET_14
+      0x0,       // PA_CL_VPORT_XSCALE_15
+      0x0,       // PA_CL_VPORT_XOFFSET_15
+      0x0,       // PA_CL_VPORT_YSCALE_15
+      0x0,       // PA_CL_VPORT_YOFFSET_15
+      0x0,       // PA_CL_VPORT_ZSCALE_15
+      0x0,       // PA_CL_VPORT_ZOFFSET_15
+      0x0,       // PA_CL_UCP_0_X
+      0x0,       // PA_CL_UCP_0_Y
+      0x0,       // PA_CL_UCP_0_Z
+      0x0,       // PA_CL_UCP_0_W
+      0x0,       // PA_CL_UCP_1_X
+      0x0,       // PA_CL_UCP_1_Y
+      0x0,       // PA_CL_UCP_1_Z
+      0x0,       // PA_CL_UCP_1_W
+      0x0,       // PA_CL_UCP_2_X
+      0x0,       // PA_CL_UCP_2_Y
+      0x0,       // PA_CL_UCP_2_Z
+      0x0,       // PA_CL_UCP_2_W
+      0x0,       // PA_CL_UCP_3_X
+      0x0,       // PA_CL_UCP_3_Y
+      0x0,       // PA_CL_UCP_3_Z
+      0x0,       // PA_CL_UCP_3_W
+      0x0,       // PA_CL_UCP_4_X
+      0x0,       // PA_CL_UCP_4_Y
+      0x0,       // PA_CL_UCP_4_Z
+      0x0,       // PA_CL_UCP_4_W
+      0x0,       // PA_CL_UCP_5_X
+      0x0,       // PA_CL_UCP_5_Y
+      0x0,       // PA_CL_UCP_5_Z
+      0x0        // PA_CL_UCP_5_W
    };
    static const uint32_t SpiPsInputCntl0Nv10[] = {
-      0x0       , // SPI_PS_INPUT_CNTL_0
-      0x0       , // SPI_PS_INPUT_CNTL_1
-      0x0       , // SPI_PS_INPUT_CNTL_2
-      0x0       , // SPI_PS_INPUT_CNTL_3
-      0x0       , // SPI_PS_INPUT_CNTL_4
-      0x0       , // SPI_PS_INPUT_CNTL_5
-      0x0       , // SPI_PS_INPUT_CNTL_6
-      0x0       , // SPI_PS_INPUT_CNTL_7
-      0x0       , // SPI_PS_INPUT_CNTL_8
-      0x0       , // SPI_PS_INPUT_CNTL_9
-      0x0       , // SPI_PS_INPUT_CNTL_10
-      0x0       , // SPI_PS_INPUT_CNTL_11
-      0x0       , // SPI_PS_INPUT_CNTL_12
-      0x0       , // SPI_PS_INPUT_CNTL_13
-      0x0       , // SPI_PS_INPUT_CNTL_14
-      0x0       , // SPI_PS_INPUT_CNTL_15
-      0x0       , // SPI_PS_INPUT_CNTL_16
-      0x0       , // SPI_PS_INPUT_CNTL_17
-      0x0       , // SPI_PS_INPUT_CNTL_18
-      0x0       , // SPI_PS_INPUT_CNTL_19
-      0x0       , // SPI_PS_INPUT_CNTL_20
-      0x0       , // SPI_PS_INPUT_CNTL_21
-      0x0       , // SPI_PS_INPUT_CNTL_22
-      0x0       , // SPI_PS_INPUT_CNTL_23
-      0x0       , // SPI_PS_INPUT_CNTL_24
-      0x0       , // SPI_PS_INPUT_CNTL_25
-      0x0       , // SPI_PS_INPUT_CNTL_26
-      0x0       , // SPI_PS_INPUT_CNTL_27
-      0x0       , // SPI_PS_INPUT_CNTL_28
-      0x0       , // SPI_PS_INPUT_CNTL_29
-      0x0       , // SPI_PS_INPUT_CNTL_30
-      0x0       , // SPI_PS_INPUT_CNTL_31
-      0x0       , // SPI_VS_OUT_CONFIG
-      0x0       , //
-      0x0       , // SPI_PS_INPUT_ENA
-      0x0       , // SPI_PS_INPUT_ADDR
-      0x0       , // SPI_INTERP_CONTROL_0
-      0x2       , // SPI_PS_IN_CONTROL
-      0x0       , //
-      0x0       , // SPI_BARYC_CNTL
-      0x0       , //
-      0x0       , // SPI_TMPRING_SIZE
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , // SPI_SHADER_IDX_FORMAT
-      0x0       , // SPI_SHADER_POS_FORMAT
-      0x0       , // SPI_SHADER_Z_FORMAT
-      0x0         // SPI_SHADER_COL_FORMAT
+      0x0, // SPI_PS_INPUT_CNTL_0
+      0x0, // SPI_PS_INPUT_CNTL_1
+      0x0, // SPI_PS_INPUT_CNTL_2
+      0x0, // SPI_PS_INPUT_CNTL_3
+      0x0, // SPI_PS_INPUT_CNTL_4
+      0x0, // SPI_PS_INPUT_CNTL_5
+      0x0, // SPI_PS_INPUT_CNTL_6
+      0x0, // SPI_PS_INPUT_CNTL_7
+      0x0, // SPI_PS_INPUT_CNTL_8
+      0x0, // SPI_PS_INPUT_CNTL_9
+      0x0, // SPI_PS_INPUT_CNTL_10
+      0x0, // SPI_PS_INPUT_CNTL_11
+      0x0, // SPI_PS_INPUT_CNTL_12
+      0x0, // SPI_PS_INPUT_CNTL_13
+      0x0, // SPI_PS_INPUT_CNTL_14
+      0x0, // SPI_PS_INPUT_CNTL_15
+      0x0, // SPI_PS_INPUT_CNTL_16
+      0x0, // SPI_PS_INPUT_CNTL_17
+      0x0, // SPI_PS_INPUT_CNTL_18
+      0x0, // SPI_PS_INPUT_CNTL_19
+      0x0, // SPI_PS_INPUT_CNTL_20
+      0x0, // SPI_PS_INPUT_CNTL_21
+      0x0, // SPI_PS_INPUT_CNTL_22
+      0x0, // SPI_PS_INPUT_CNTL_23
+      0x0, // SPI_PS_INPUT_CNTL_24
+      0x0, // SPI_PS_INPUT_CNTL_25
+      0x0, // SPI_PS_INPUT_CNTL_26
+      0x0, // SPI_PS_INPUT_CNTL_27
+      0x0, // SPI_PS_INPUT_CNTL_28
+      0x0, // SPI_PS_INPUT_CNTL_29
+      0x0, // SPI_PS_INPUT_CNTL_30
+      0x0, // SPI_PS_INPUT_CNTL_31
+      0x0, // SPI_VS_OUT_CONFIG
+      0x0, //
+      0x0, // SPI_PS_INPUT_ENA
+      0x0, // SPI_PS_INPUT_ADDR
+      0x0, // SPI_INTERP_CONTROL_0
+      0x2, // SPI_PS_IN_CONTROL
+      0x0, //
+      0x0, // SPI_BARYC_CNTL
+      0x0, //
+      0x0, // SPI_TMPRING_SIZE
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, // SPI_SHADER_IDX_FORMAT
+      0x0, // SPI_SHADER_POS_FORMAT
+      0x0, // SPI_SHADER_Z_FORMAT
+      0x0  // SPI_SHADER_COL_FORMAT
    };
    static const uint32_t SxPsDownconvertNv10[] = {
-      0x0       , // SX_PS_DOWNCONVERT
-      0x0       , // SX_BLEND_OPT_EPSILON
-      0x0       , // SX_BLEND_OPT_CONTROL
-      0x0       , // SX_MRT0_BLEND_OPT
-      0x0       , // SX_MRT1_BLEND_OPT
-      0x0       , // SX_MRT2_BLEND_OPT
-      0x0       , // SX_MRT3_BLEND_OPT
-      0x0       , // SX_MRT4_BLEND_OPT
-      0x0       , // SX_MRT5_BLEND_OPT
-      0x0       , // SX_MRT6_BLEND_OPT
-      0x0       , // SX_MRT7_BLEND_OPT
-      0x0       , // CB_BLEND0_CONTROL
-      0x0       , // CB_BLEND1_CONTROL
-      0x0       , // CB_BLEND2_CONTROL
-      0x0       , // CB_BLEND3_CONTROL
-      0x0       , // CB_BLEND4_CONTROL
-      0x0       , // CB_BLEND5_CONTROL
-      0x0       , // CB_BLEND6_CONTROL
-      0x0         // CB_BLEND7_CONTROL
+      0x0, // SX_PS_DOWNCONVERT
+      0x0, // SX_BLEND_OPT_EPSILON
+      0x0, // SX_BLEND_OPT_CONTROL
+      0x0, // SX_MRT0_BLEND_OPT
+      0x0, // SX_MRT1_BLEND_OPT
+      0x0, // SX_MRT2_BLEND_OPT
+      0x0, // SX_MRT3_BLEND_OPT
+      0x0, // SX_MRT4_BLEND_OPT
+      0x0, // SX_MRT5_BLEND_OPT
+      0x0, // SX_MRT6_BLEND_OPT
+      0x0, // SX_MRT7_BLEND_OPT
+      0x0, // CB_BLEND0_CONTROL
+      0x0, // CB_BLEND1_CONTROL
+      0x0, // CB_BLEND2_CONTROL
+      0x0, // CB_BLEND3_CONTROL
+      0x0, // CB_BLEND4_CONTROL
+      0x0, // CB_BLEND5_CONTROL
+      0x0, // CB_BLEND6_CONTROL
+      0x0  // CB_BLEND7_CONTROL
    };
    static const uint32_t GeMaxOutputPerSubgroupNv10[] = {
-      0x0       , // GE_MAX_OUTPUT_PER_SUBGROUP
-      0x0       , // DB_DEPTH_CONTROL
-      0x0       , // DB_EQAA
-      0x0       , // CB_COLOR_CONTROL
-      0x0       , // DB_SHADER_CONTROL
-      0x90000   , // PA_CL_CLIP_CNTL
-      0x4       , // PA_SU_SC_MODE_CNTL
-      0x0       , // PA_CL_VTE_CNTL
-      0x0       , // PA_CL_VS_OUT_CNTL
-      0x0         // PA_CL_NANINF_CNTL
+      0x0,     // GE_MAX_OUTPUT_PER_SUBGROUP
+      0x0,     // DB_DEPTH_CONTROL
+      0x0,     // DB_EQAA
+      0x0,     // CB_COLOR_CONTROL
+      0x0,     // DB_SHADER_CONTROL
+      0x90000, // PA_CL_CLIP_CNTL
+      0x4,     // PA_SU_SC_MODE_CNTL
+      0x0,     // PA_CL_VTE_CNTL
+      0x0,     // PA_CL_VS_OUT_CNTL
+      0x0      // PA_CL_NANINF_CNTL
    };
    static const uint32_t PaSuPrimFilterCntlNv10[] = {
-      0x0       , // PA_SU_PRIM_FILTER_CNTL
-      0x0       , // PA_SU_SMALL_PRIM_FILTER_CNTL
-      0x0       , // PA_CL_OBJPRIM_ID_CNTL
-      0x0       , // PA_CL_NGG_CNTL
-      0x0       , // PA_SU_OVER_RASTERIZATION_CNTL
-      0x0       , // PA_STEREO_CNTL
-      0x0         // PA_STATE_STEREO_X
+      0x0, // PA_SU_PRIM_FILTER_CNTL
+      0x0, // PA_SU_SMALL_PRIM_FILTER_CNTL
+      0x0, // PA_CL_OBJPRIM_ID_CNTL
+      0x0, // PA_CL_NGG_CNTL
+      0x0, // PA_SU_OVER_RASTERIZATION_CNTL
+      0x0, // PA_STEREO_CNTL
+      0x0  // PA_STATE_STEREO_X
    };
    static const uint32_t PaSuPointSizeNv10[] = {
-      0x0       , // PA_SU_POINT_SIZE
-      0x0       , // PA_SU_POINT_MINMAX
-      0x0       , // PA_SU_LINE_CNTL
-      0x0         // PA_SC_LINE_STIPPLE
+      0x0, // PA_SU_POINT_SIZE
+      0x0, // PA_SU_POINT_MINMAX
+      0x0, // PA_SU_LINE_CNTL
+      0x0  // PA_SC_LINE_STIPPLE
    };
    static const uint32_t VgtHosMaxTessLevelNv10[] = {
-      0x0       , // VGT_HOS_MAX_TESS_LEVEL
-      0x0         // VGT_HOS_MIN_TESS_LEVEL
+      0x0, // VGT_HOS_MAX_TESS_LEVEL
+      0x0  // VGT_HOS_MIN_TESS_LEVEL
    };
    static const uint32_t VgtGsModeNv10[] = {
-      0x0       , // VGT_GS_MODE
-      0x0       , // VGT_GS_ONCHIP_CNTL
-      0x0       , // PA_SC_MODE_CNTL_0
-      0x0       , // PA_SC_MODE_CNTL_1
-      0x0       , // VGT_ENHANCE
-      0x100     , // VGT_GS_PER_ES
-      0x80      , // VGT_ES_PER_GS
-      0x2       , // VGT_GS_PER_VS
-      0x0       , // VGT_GSVS_RING_OFFSET_1
-      0x0       , // VGT_GSVS_RING_OFFSET_2
-      0x0       , // VGT_GSVS_RING_OFFSET_3
-      0x0         // VGT_GS_OUT_PRIM_TYPE
+      0x0,   // VGT_GS_MODE
+      0x0,   // VGT_GS_ONCHIP_CNTL
+      0x0,   // PA_SC_MODE_CNTL_0
+      0x0,   // PA_SC_MODE_CNTL_1
+      0x0,   // VGT_ENHANCE
+      0x100, // VGT_GS_PER_ES
+      0x80,  // VGT_ES_PER_GS
+      0x2,   // VGT_GS_PER_VS
+      0x0,   // VGT_GSVS_RING_OFFSET_1
+      0x0,   // VGT_GSVS_RING_OFFSET_2
+      0x0,   // VGT_GSVS_RING_OFFSET_3
+      0x0    // VGT_GS_OUT_PRIM_TYPE
    };
    static const uint32_t VgtPrimitiveidEnNv10[] = {
-      0x0         // VGT_PRIMITIVEID_EN
+      0x0 // VGT_PRIMITIVEID_EN
    };
    static const uint32_t VgtPrimitiveidResetNv10[] = {
-      0x0         // VGT_PRIMITIVEID_RESET
+      0x0 // VGT_PRIMITIVEID_RESET
    };
    static const uint32_t VgtDrawPayloadCntlNv10[] = {
-      0x0       , // VGT_DRAW_PAYLOAD_CNTL
-      0x0       , //
-      0x0       , // VGT_INSTANCE_STEP_RATE_0
-      0x0       , // VGT_INSTANCE_STEP_RATE_1
-      0x0       , // IA_MULTI_VGT_PARAM
-      0x0       , // VGT_ESGS_RING_ITEMSIZE
-      0x0       , // VGT_GSVS_RING_ITEMSIZE
-      0x0       , // VGT_REUSE_OFF
-      0x0       , // VGT_VTX_CNT_EN
-      0x0       , // DB_HTILE_SURFACE
-      0x0       , // DB_SRESULTS_COMPARE_STATE0
-      0x0       , // DB_SRESULTS_COMPARE_STATE1
-      0x0       , // DB_PRELOAD_CONTROL
-      0x0       , //
-      0x0       , // VGT_STRMOUT_BUFFER_SIZE_0
-      0x0       , // VGT_STRMOUT_VTX_STRIDE_0
-      0x0       , //
-      0x0       , // VGT_STRMOUT_BUFFER_OFFSET_0
-      0x0       , // VGT_STRMOUT_BUFFER_SIZE_1
-      0x0       , // VGT_STRMOUT_VTX_STRIDE_1
-      0x0       , //
-      0x0       , // VGT_STRMOUT_BUFFER_OFFSET_1
-      0x0       , // VGT_STRMOUT_BUFFER_SIZE_2
-      0x0       , // VGT_STRMOUT_VTX_STRIDE_2
-      0x0       , //
-      0x0       , // VGT_STRMOUT_BUFFER_OFFSET_2
-      0x0       , // VGT_STRMOUT_BUFFER_SIZE_3
-      0x0       , // VGT_STRMOUT_VTX_STRIDE_3
-      0x0       , //
-      0x0       , // VGT_STRMOUT_BUFFER_OFFSET_3
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , // VGT_STRMOUT_DRAW_OPAQUE_OFFSET
-      0x0       , // VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE
-      0x0       , // VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE
-      0x0       , //
-      0x0       , // VGT_GS_MAX_VERT_OUT
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , // GE_NGG_SUBGRP_CNTL
-      0x0       , // VGT_TESS_DISTRIBUTION
-      0x0       , // VGT_SHADER_STAGES_EN
-      0x0       , // VGT_LS_HS_CONFIG
-      0x0       , // VGT_GS_VERT_ITEMSIZE
-      0x0       , // VGT_GS_VERT_ITEMSIZE_1
-      0x0       , // VGT_GS_VERT_ITEMSIZE_2
-      0x0       , // VGT_GS_VERT_ITEMSIZE_3
-      0x0       , // VGT_TF_PARAM
-      0x0       , // DB_ALPHA_TO_MASK
-      0x0       , // VGT_DISPATCH_DRAW_INDEX
-      0x0       , // PA_SU_POLY_OFFSET_DB_FMT_CNTL
-      0x0       , // PA_SU_POLY_OFFSET_CLAMP
-      0x0       , // PA_SU_POLY_OFFSET_FRONT_SCALE
-      0x0       , // PA_SU_POLY_OFFSET_FRONT_OFFSET
-      0x0       , // PA_SU_POLY_OFFSET_BACK_SCALE
-      0x0       , // PA_SU_POLY_OFFSET_BACK_OFFSET
-      0x0       , // VGT_GS_INSTANCE_CNT
-      0x0       , // VGT_STRMOUT_CONFIG
-      0x0         // VGT_STRMOUT_BUFFER_CONFIG
+      0x0, // VGT_DRAW_PAYLOAD_CNTL
+      0x0, //
+      0x0, // VGT_INSTANCE_STEP_RATE_0
+      0x0, // VGT_INSTANCE_STEP_RATE_1
+      0x0, // IA_MULTI_VGT_PARAM
+      0x0, // VGT_ESGS_RING_ITEMSIZE
+      0x0, // VGT_GSVS_RING_ITEMSIZE
+      0x0, // VGT_REUSE_OFF
+      0x0, // VGT_VTX_CNT_EN
+      0x0, // DB_HTILE_SURFACE
+      0x0, // DB_SRESULTS_COMPARE_STATE0
+      0x0, // DB_SRESULTS_COMPARE_STATE1
+      0x0, // DB_PRELOAD_CONTROL
+      0x0, //
+      0x0, // VGT_STRMOUT_BUFFER_SIZE_0
+      0x0, // VGT_STRMOUT_VTX_STRIDE_0
+      0x0, //
+      0x0, // VGT_STRMOUT_BUFFER_OFFSET_0
+      0x0, // VGT_STRMOUT_BUFFER_SIZE_1
+      0x0, // VGT_STRMOUT_VTX_STRIDE_1
+      0x0, //
+      0x0, // VGT_STRMOUT_BUFFER_OFFSET_1
+      0x0, // VGT_STRMOUT_BUFFER_SIZE_2
+      0x0, // VGT_STRMOUT_VTX_STRIDE_2
+      0x0, //
+      0x0, // VGT_STRMOUT_BUFFER_OFFSET_2
+      0x0, // VGT_STRMOUT_BUFFER_SIZE_3
+      0x0, // VGT_STRMOUT_VTX_STRIDE_3
+      0x0, //
+      0x0, // VGT_STRMOUT_BUFFER_OFFSET_3
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, // VGT_STRMOUT_DRAW_OPAQUE_OFFSET
+      0x0, // VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE
+      0x0, // VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE
+      0x0, //
+      0x0, // VGT_GS_MAX_VERT_OUT
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, // GE_NGG_SUBGRP_CNTL
+      0x0, // VGT_TESS_DISTRIBUTION
+      0x0, // VGT_SHADER_STAGES_EN
+      0x0, // VGT_LS_HS_CONFIG
+      0x0, // VGT_GS_VERT_ITEMSIZE
+      0x0, // VGT_GS_VERT_ITEMSIZE_1
+      0x0, // VGT_GS_VERT_ITEMSIZE_2
+      0x0, // VGT_GS_VERT_ITEMSIZE_3
+      0x0, // VGT_TF_PARAM
+      0x0, // DB_ALPHA_TO_MASK
+      0x0, // VGT_DISPATCH_DRAW_INDEX
+      0x0, // PA_SU_POLY_OFFSET_DB_FMT_CNTL
+      0x0, // PA_SU_POLY_OFFSET_CLAMP
+      0x0, // PA_SU_POLY_OFFSET_FRONT_SCALE
+      0x0, // PA_SU_POLY_OFFSET_FRONT_OFFSET
+      0x0, // PA_SU_POLY_OFFSET_BACK_SCALE
+      0x0, // PA_SU_POLY_OFFSET_BACK_OFFSET
+      0x0, // VGT_GS_INSTANCE_CNT
+      0x0, // VGT_STRMOUT_CONFIG
+      0x0  // VGT_STRMOUT_BUFFER_CONFIG
    };
    static const uint32_t PaScCentroidPriority0Nv10[] = {
-      0x0       , // PA_SC_CENTROID_PRIORITY_0
-      0x0       , // PA_SC_CENTROID_PRIORITY_1
-      0x1000    , // PA_SC_LINE_CNTL
-      0x0       , // PA_SC_AA_CONFIG
-      0x5       , // PA_SU_VTX_CNTL
+      0x0,        // PA_SC_CENTROID_PRIORITY_0
+      0x0,        // PA_SC_CENTROID_PRIORITY_1
+      0x1000,     // PA_SC_LINE_CNTL
+      0x0,        // PA_SC_AA_CONFIG
+      0x5,        // PA_SU_VTX_CNTL
       0x3f800000, // PA_CL_GB_VERT_CLIP_ADJ
       0x3f800000, // PA_CL_GB_VERT_DISC_ADJ
       0x3f800000, // PA_CL_GB_HORZ_CLIP_ADJ
       0x3f800000, // PA_CL_GB_HORZ_DISC_ADJ
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_2
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_3
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_2
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_3
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_2
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_3
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_2
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_3
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_2
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_3
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_2
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_3
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_2
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_3
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_2
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_3
       0xffffffff, // PA_SC_AA_MASK_X0Y0_X1Y0
       0xffffffff, // PA_SC_AA_MASK_X0Y1_X1Y1
-      0x0       , // PA_SC_SHADER_CONTROL
-      0x3       , // PA_SC_BINNER_CNTL_0
-      0x0       , // PA_SC_BINNER_CNTL_1
-      0x100000  , // PA_SC_CONSERVATIVE_RASTERIZATION_CNTL
-      0x0       , // PA_SC_NGG_MODE_CNTL
-      0x0       , //
-      0x1e      , // VGT_VERTEX_REUSE_BLOCK_CNTL
-      0x20      , // VGT_OUT_DEALLOC_CNTL
-      0x0       , // CB_COLOR0_BASE
-      0x0       , //
-      0x0       , //
-      0x0       , // CB_COLOR0_VIEW
-      0x0       , // CB_COLOR0_INFO
-      0x0       , // CB_COLOR0_ATTRIB
-      0x0       , // CB_COLOR0_DCC_CONTROL
-      0x0       , // CB_COLOR0_CMASK
-      0x0       , //
-      0x0       , // CB_COLOR0_FMASK
-      0x0       , //
-      0x0       , // CB_COLOR0_CLEAR_WORD0
-      0x0       , // CB_COLOR0_CLEAR_WORD1
-      0x0       , // CB_COLOR0_DCC_BASE
-      0x0       , //
-      0x0       , // CB_COLOR1_BASE
-      0x0       , //
-      0x0       , //
-      0x0       , // CB_COLOR1_VIEW
-      0x0       , // CB_COLOR1_INFO
-      0x0       , // CB_COLOR1_ATTRIB
-      0x0       , // CB_COLOR1_DCC_CONTROL
-      0x0       , // CB_COLOR1_CMASK
-      0x0       , //
-      0x0       , // CB_COLOR1_FMASK
-      0x0       , //
-      0x0       , // CB_COLOR1_CLEAR_WORD0
-      0x0       , // CB_COLOR1_CLEAR_WORD1
-      0x0       , // CB_COLOR1_DCC_BASE
-      0x0       , //
-      0x0       , // CB_COLOR2_BASE
-      0x0       , //
-      0x0       , //
-      0x0       , // CB_COLOR2_VIEW
-      0x0       , // CB_COLOR2_INFO
-      0x0       , // CB_COLOR2_ATTRIB
-      0x0       , // CB_COLOR2_DCC_CONTROL
-      0x0       , // CB_COLOR2_CMASK
-      0x0       , //
-      0x0       , // CB_COLOR2_FMASK
-      0x0       , //
-      0x0       , // CB_COLOR2_CLEAR_WORD0
-      0x0       , // CB_COLOR2_CLEAR_WORD1
-      0x0       , // CB_COLOR2_DCC_BASE
-      0x0       , //
-      0x0       , // CB_COLOR3_BASE
-      0x0       , //
-      0x0       , //
-      0x0       , // CB_COLOR3_VIEW
-      0x0       , // CB_COLOR3_INFO
-      0x0       , // CB_COLOR3_ATTRIB
-      0x0       , // CB_COLOR3_DCC_CONTROL
-      0x0       , // CB_COLOR3_CMASK
-      0x0       , //
-      0x0       , // CB_COLOR3_FMASK
-      0x0       , //
-      0x0       , // CB_COLOR3_CLEAR_WORD0
-      0x0       , // CB_COLOR3_CLEAR_WORD1
-      0x0       , // CB_COLOR3_DCC_BASE
-      0x0       , //
-      0x0       , // CB_COLOR4_BASE
-      0x0       , //
-      0x0       , //
-      0x0       , // CB_COLOR4_VIEW
-      0x0       , // CB_COLOR4_INFO
-      0x0       , // CB_COLOR4_ATTRIB
-      0x0       , // CB_COLOR4_DCC_CONTROL
-      0x0       , // CB_COLOR4_CMASK
-      0x0       , //
-      0x0       , // CB_COLOR4_FMASK
-      0x0       , //
-      0x0       , // CB_COLOR4_CLEAR_WORD0
-      0x0       , // CB_COLOR4_CLEAR_WORD1
-      0x0       , // CB_COLOR4_DCC_BASE
-      0x0       , //
-      0x0       , // CB_COLOR5_BASE
-      0x0       , //
-      0x0       , //
-      0x0       , // CB_COLOR5_VIEW
-      0x0       , // CB_COLOR5_INFO
-      0x0       , // CB_COLOR5_ATTRIB
-      0x0       , // CB_COLOR5_DCC_CONTROL
-      0x0       , // CB_COLOR5_CMASK
-      0x0       , //
-      0x0       , // CB_COLOR5_FMASK
-      0x0       , //
-      0x0       , // CB_COLOR5_CLEAR_WORD0
-      0x0       , // CB_COLOR5_CLEAR_WORD1
-      0x0       , // CB_COLOR5_DCC_BASE
-      0x0       , //
-      0x0       , // CB_COLOR6_BASE
-      0x0       , //
-      0x0       , //
-      0x0       , // CB_COLOR6_VIEW
-      0x0       , // CB_COLOR6_INFO
-      0x0       , // CB_COLOR6_ATTRIB
-      0x0       , // CB_COLOR6_DCC_CONTROL
-      0x0       , // CB_COLOR6_CMASK
-      0x0       , //
-      0x0       , // CB_COLOR6_FMASK
-      0x0       , //
-      0x0       , // CB_COLOR6_CLEAR_WORD0
-      0x0       , // CB_COLOR6_CLEAR_WORD1
-      0x0       , // CB_COLOR6_DCC_BASE
-      0x0       , //
-      0x0       , // CB_COLOR7_BASE
-      0x0       , //
-      0x0       , //
-      0x0       , // CB_COLOR7_VIEW
-      0x0       , // CB_COLOR7_INFO
-      0x0       , // CB_COLOR7_ATTRIB
-      0x0       , // CB_COLOR7_DCC_CONTROL
-      0x0       , // CB_COLOR7_CMASK
-      0x0       , //
-      0x0       , // CB_COLOR7_FMASK
-      0x0       , //
-      0x0       , // CB_COLOR7_CLEAR_WORD0
-      0x0       , // CB_COLOR7_CLEAR_WORD1
-      0x0       , // CB_COLOR7_DCC_BASE
-      0x0       , //
-      0x0       , // CB_COLOR0_BASE_EXT
-      0x0       , // CB_COLOR1_BASE_EXT
-      0x0       , // CB_COLOR2_BASE_EXT
-      0x0       , // CB_COLOR3_BASE_EXT
-      0x0       , // CB_COLOR4_BASE_EXT
-      0x0       , // CB_COLOR5_BASE_EXT
-      0x0       , // CB_COLOR6_BASE_EXT
-      0x0       , // CB_COLOR7_BASE_EXT
-      0x0       , // CB_COLOR0_CMASK_BASE_EXT
-      0x0       , // CB_COLOR1_CMASK_BASE_EXT
-      0x0       , // CB_COLOR2_CMASK_BASE_EXT
-      0x0       , // CB_COLOR3_CMASK_BASE_EXT
-      0x0       , // CB_COLOR4_CMASK_BASE_EXT
-      0x0       , // CB_COLOR5_CMASK_BASE_EXT
-      0x0       , // CB_COLOR6_CMASK_BASE_EXT
-      0x0       , // CB_COLOR7_CMASK_BASE_EXT
-      0x0       , // CB_COLOR0_FMASK_BASE_EXT
-      0x0       , // CB_COLOR1_FMASK_BASE_EXT
-      0x0       , // CB_COLOR2_FMASK_BASE_EXT
-      0x0       , // CB_COLOR3_FMASK_BASE_EXT
-      0x0       , // CB_COLOR4_FMASK_BASE_EXT
-      0x0       , // CB_COLOR5_FMASK_BASE_EXT
-      0x0       , // CB_COLOR6_FMASK_BASE_EXT
-      0x0       , // CB_COLOR7_FMASK_BASE_EXT
-      0x0       , // CB_COLOR0_DCC_BASE_EXT
-      0x0       , // CB_COLOR1_DCC_BASE_EXT
-      0x0       , // CB_COLOR2_DCC_BASE_EXT
-      0x0       , // CB_COLOR3_DCC_BASE_EXT
-      0x0       , // CB_COLOR4_DCC_BASE_EXT
-      0x0       , // CB_COLOR5_DCC_BASE_EXT
-      0x0       , // CB_COLOR6_DCC_BASE_EXT
-      0x0       , // CB_COLOR7_DCC_BASE_EXT
-      0x0       , // CB_COLOR0_ATTRIB2
-      0x0       , // CB_COLOR1_ATTRIB2
-      0x0       , // CB_COLOR2_ATTRIB2
-      0x0       , // CB_COLOR3_ATTRIB2
-      0x0       , // CB_COLOR4_ATTRIB2
-      0x0       , // CB_COLOR5_ATTRIB2
-      0x0       , // CB_COLOR6_ATTRIB2
-      0x0       , // CB_COLOR7_ATTRIB2
-      0x0       , // CB_COLOR0_ATTRIB3
-      0x0       , // CB_COLOR1_ATTRIB3
-      0x0       , // CB_COLOR2_ATTRIB3
-      0x0       , // CB_COLOR3_ATTRIB3
-      0x0       , // CB_COLOR4_ATTRIB3
-      0x0       , // CB_COLOR5_ATTRIB3
-      0x0       , // CB_COLOR6_ATTRIB3
+      0x0,        // PA_SC_SHADER_CONTROL
+      0x3,        // PA_SC_BINNER_CNTL_0
+      0x0,        // PA_SC_BINNER_CNTL_1
+      0x100000,   // PA_SC_CONSERVATIVE_RASTERIZATION_CNTL
+      0x0,        // PA_SC_NGG_MODE_CNTL
+      0x0,        //
+      0x1e,       // VGT_VERTEX_REUSE_BLOCK_CNTL
+      0x20,       // VGT_OUT_DEALLOC_CNTL
+      0x0,        // CB_COLOR0_BASE
+      0x0,        //
+      0x0,        //
+      0x0,        // CB_COLOR0_VIEW
+      0x0,        // CB_COLOR0_INFO
+      0x0,        // CB_COLOR0_ATTRIB
+      0x0,        // CB_COLOR0_DCC_CONTROL
+      0x0,        // CB_COLOR0_CMASK
+      0x0,        //
+      0x0,        // CB_COLOR0_FMASK
+      0x0,        //
+      0x0,        // CB_COLOR0_CLEAR_WORD0
+      0x0,        // CB_COLOR0_CLEAR_WORD1
+      0x0,        // CB_COLOR0_DCC_BASE
+      0x0,        //
+      0x0,        // CB_COLOR1_BASE
+      0x0,        //
+      0x0,        //
+      0x0,        // CB_COLOR1_VIEW
+      0x0,        // CB_COLOR1_INFO
+      0x0,        // CB_COLOR1_ATTRIB
+      0x0,        // CB_COLOR1_DCC_CONTROL
+      0x0,        // CB_COLOR1_CMASK
+      0x0,        //
+      0x0,        // CB_COLOR1_FMASK
+      0x0,        //
+      0x0,        // CB_COLOR1_CLEAR_WORD0
+      0x0,        // CB_COLOR1_CLEAR_WORD1
+      0x0,        // CB_COLOR1_DCC_BASE
+      0x0,        //
+      0x0,        // CB_COLOR2_BASE
+      0x0,        //
+      0x0,        //
+      0x0,        // CB_COLOR2_VIEW
+      0x0,        // CB_COLOR2_INFO
+      0x0,        // CB_COLOR2_ATTRIB
+      0x0,        // CB_COLOR2_DCC_CONTROL
+      0x0,        // CB_COLOR2_CMASK
+      0x0,        //
+      0x0,        // CB_COLOR2_FMASK
+      0x0,        //
+      0x0,        // CB_COLOR2_CLEAR_WORD0
+      0x0,        // CB_COLOR2_CLEAR_WORD1
+      0x0,        // CB_COLOR2_DCC_BASE
+      0x0,        //
+      0x0,        // CB_COLOR3_BASE
+      0x0,        //
+      0x0,        //
+      0x0,        // CB_COLOR3_VIEW
+      0x0,        // CB_COLOR3_INFO
+      0x0,        // CB_COLOR3_ATTRIB
+      0x0,        // CB_COLOR3_DCC_CONTROL
+      0x0,        // CB_COLOR3_CMASK
+      0x0,        //
+      0x0,        // CB_COLOR3_FMASK
+      0x0,        //
+      0x0,        // CB_COLOR3_CLEAR_WORD0
+      0x0,        // CB_COLOR3_CLEAR_WORD1
+      0x0,        // CB_COLOR3_DCC_BASE
+      0x0,        //
+      0x0,        // CB_COLOR4_BASE
+      0x0,        //
+      0x0,        //
+      0x0,        // CB_COLOR4_VIEW
+      0x0,        // CB_COLOR4_INFO
+      0x0,        // CB_COLOR4_ATTRIB
+      0x0,        // CB_COLOR4_DCC_CONTROL
+      0x0,        // CB_COLOR4_CMASK
+      0x0,        //
+      0x0,        // CB_COLOR4_FMASK
+      0x0,        //
+      0x0,        // CB_COLOR4_CLEAR_WORD0
+      0x0,        // CB_COLOR4_CLEAR_WORD1
+      0x0,        // CB_COLOR4_DCC_BASE
+      0x0,        //
+      0x0,        // CB_COLOR5_BASE
+      0x0,        //
+      0x0,        //
+      0x0,        // CB_COLOR5_VIEW
+      0x0,        // CB_COLOR5_INFO
+      0x0,        // CB_COLOR5_ATTRIB
+      0x0,        // CB_COLOR5_DCC_CONTROL
+      0x0,        // CB_COLOR5_CMASK
+      0x0,        //
+      0x0,        // CB_COLOR5_FMASK
+      0x0,        //
+      0x0,        // CB_COLOR5_CLEAR_WORD0
+      0x0,        // CB_COLOR5_CLEAR_WORD1
+      0x0,        // CB_COLOR5_DCC_BASE
+      0x0,        //
+      0x0,        // CB_COLOR6_BASE
+      0x0,        //
+      0x0,        //
+      0x0,        // CB_COLOR6_VIEW
+      0x0,        // CB_COLOR6_INFO
+      0x0,        // CB_COLOR6_ATTRIB
+      0x0,        // CB_COLOR6_DCC_CONTROL
+      0x0,        // CB_COLOR6_CMASK
+      0x0,        //
+      0x0,        // CB_COLOR6_FMASK
+      0x0,        //
+      0x0,        // CB_COLOR6_CLEAR_WORD0
+      0x0,        // CB_COLOR6_CLEAR_WORD1
+      0x0,        // CB_COLOR6_DCC_BASE
+      0x0,        //
+      0x0,        // CB_COLOR7_BASE
+      0x0,        //
+      0x0,        //
+      0x0,        // CB_COLOR7_VIEW
+      0x0,        // CB_COLOR7_INFO
+      0x0,        // CB_COLOR7_ATTRIB
+      0x0,        // CB_COLOR7_DCC_CONTROL
+      0x0,        // CB_COLOR7_CMASK
+      0x0,        //
+      0x0,        // CB_COLOR7_FMASK
+      0x0,        //
+      0x0,        // CB_COLOR7_CLEAR_WORD0
+      0x0,        // CB_COLOR7_CLEAR_WORD1
+      0x0,        // CB_COLOR7_DCC_BASE
+      0x0,        //
+      0x0,        // CB_COLOR0_BASE_EXT
+      0x0,        // CB_COLOR1_BASE_EXT
+      0x0,        // CB_COLOR2_BASE_EXT
+      0x0,        // CB_COLOR3_BASE_EXT
+      0x0,        // CB_COLOR4_BASE_EXT
+      0x0,        // CB_COLOR5_BASE_EXT
+      0x0,        // CB_COLOR6_BASE_EXT
+      0x0,        // CB_COLOR7_BASE_EXT
+      0x0,        // CB_COLOR0_CMASK_BASE_EXT
+      0x0,        // CB_COLOR1_CMASK_BASE_EXT
+      0x0,        // CB_COLOR2_CMASK_BASE_EXT
+      0x0,        // CB_COLOR3_CMASK_BASE_EXT
+      0x0,        // CB_COLOR4_CMASK_BASE_EXT
+      0x0,        // CB_COLOR5_CMASK_BASE_EXT
+      0x0,        // CB_COLOR6_CMASK_BASE_EXT
+      0x0,        // CB_COLOR7_CMASK_BASE_EXT
+      0x0,        // CB_COLOR0_FMASK_BASE_EXT
+      0x0,        // CB_COLOR1_FMASK_BASE_EXT
+      0x0,        // CB_COLOR2_FMASK_BASE_EXT
+      0x0,        // CB_COLOR3_FMASK_BASE_EXT
+      0x0,        // CB_COLOR4_FMASK_BASE_EXT
+      0x0,        // CB_COLOR5_FMASK_BASE_EXT
+      0x0,        // CB_COLOR6_FMASK_BASE_EXT
+      0x0,        // CB_COLOR7_FMASK_BASE_EXT
+      0x0,        // CB_COLOR0_DCC_BASE_EXT
+      0x0,        // CB_COLOR1_DCC_BASE_EXT
+      0x0,        // CB_COLOR2_DCC_BASE_EXT
+      0x0,        // CB_COLOR3_DCC_BASE_EXT
+      0x0,        // CB_COLOR4_DCC_BASE_EXT
+      0x0,        // CB_COLOR5_DCC_BASE_EXT
+      0x0,        // CB_COLOR6_DCC_BASE_EXT
+      0x0,        // CB_COLOR7_DCC_BASE_EXT
+      0x0,        // CB_COLOR0_ATTRIB2
+      0x0,        // CB_COLOR1_ATTRIB2
+      0x0,        // CB_COLOR2_ATTRIB2
+      0x0,        // CB_COLOR3_ATTRIB2
+      0x0,        // CB_COLOR4_ATTRIB2
+      0x0,        // CB_COLOR5_ATTRIB2
+      0x0,        // CB_COLOR6_ATTRIB2
+      0x0,        // CB_COLOR7_ATTRIB2
+      0x0,        // CB_COLOR0_ATTRIB3
+      0x0,        // CB_COLOR1_ATTRIB3
+      0x0,        // CB_COLOR2_ATTRIB3
+      0x0,        // CB_COLOR3_ATTRIB3
+      0x0,        // CB_COLOR4_ATTRIB3
+      0x0,        // CB_COLOR5_ATTRIB3
+      0x0,        // CB_COLOR6_ATTRIB3
       0x0         // CB_COLOR7_ATTRIB3
    };
 
    set_context_reg_seq_array(cs, R_028000_DB_RENDER_CONTROL, SET(DbRenderControlNv10));
    set_context_reg_seq_array(cs, R_0281E8_COHER_DEST_BASE_HI_0, SET(CoherDestBaseHi0Nv10));
-   set_context_reg_seq_array(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, SET(VgtMultiPrimIbResetIndxNv10));
+   set_context_reg_seq_array(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
+                             SET(VgtMultiPrimIbResetIndxNv10));
    set_context_reg_seq_array(cs, R_028644_SPI_PS_INPUT_CNTL_0, SET(SpiPsInputCntl0Nv10));
    set_context_reg_seq_array(cs, R_028754_SX_PS_DOWNCONVERT, SET(SxPsDownconvertNv10));
-   set_context_reg_seq_array(cs, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP, SET(GeMaxOutputPerSubgroupNv10));
+   set_context_reg_seq_array(cs, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
+                             SET(GeMaxOutputPerSubgroupNv10));
    set_context_reg_seq_array(cs, R_02882C_PA_SU_PRIM_FILTER_CNTL, SET(PaSuPrimFilterCntlNv10));
    set_context_reg_seq_array(cs, R_028A00_PA_SU_POINT_SIZE, SET(PaSuPointSizeNv10));
    set_context_reg_seq_array(cs, R_028A18_VGT_HOS_MAX_TESS_LEVEL, SET(VgtHosMaxTessLevelNv10));
@@ -2207,7 +2222,8 @@ static void gfx10_emulate_clear_state(struct radeon_cmdbuf *cs, unsigned num_reg
    set_context_reg_seq_array(cs, R_028A84_VGT_PRIMITIVEID_EN, SET(VgtPrimitiveidEnNv10));
    set_context_reg_seq_array(cs, R_028A8C_VGT_PRIMITIVEID_RESET, SET(VgtPrimitiveidResetNv10));
    set_context_reg_seq_array(cs, R_028A98_VGT_DRAW_PAYLOAD_CNTL, SET(VgtDrawPayloadCntlNv10));
-   set_context_reg_seq_array(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, SET(PaScCentroidPriority0Nv10));
+   set_context_reg_seq_array(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0,
+                             SET(PaScCentroidPriority0Nv10));
 
    for (unsigned i = 0; i < num_reg_pairs; i++)
       set_context_reg_seq_array(cs, reg_offsets[i], 1, &reg_values[i]);
@@ -2222,68 +2238,68 @@ static void gfx103_emulate_clear_state(struct radeon_cmdbuf *cs, unsigned num_re
                                        set_context_reg_seq_array_fn set_context_reg_seq_array)
 {
    static const uint32_t DbRenderControlGfx103[] = {
-      0x0       , // DB_RENDER_CONTROL
-      0x0       , // DB_COUNT_CONTROL
-      0x0       , // DB_DEPTH_VIEW
-      0x0       , // DB_RENDER_OVERRIDE
-      0x0       , // DB_RENDER_OVERRIDE2
-      0x0       , // DB_HTILE_DATA_BASE
-      0x0       , //
-      0x0       , // DB_DEPTH_SIZE_XY
-      0x0       , // DB_DEPTH_BOUNDS_MIN
-      0x0       , // DB_DEPTH_BOUNDS_MAX
-      0x0       , // DB_STENCIL_CLEAR
-      0x0       , // DB_DEPTH_CLEAR
-      0x0       , // PA_SC_SCREEN_SCISSOR_TL
+      0x0,        // DB_RENDER_CONTROL
+      0x0,        // DB_COUNT_CONTROL
+      0x0,        // DB_DEPTH_VIEW
+      0x0,        // DB_RENDER_OVERRIDE
+      0x0,        // DB_RENDER_OVERRIDE2
+      0x0,        // DB_HTILE_DATA_BASE
+      0x0,        //
+      0x0,        // DB_DEPTH_SIZE_XY
+      0x0,        // DB_DEPTH_BOUNDS_MIN
+      0x0,        // DB_DEPTH_BOUNDS_MAX
+      0x0,        // DB_STENCIL_CLEAR
+      0x0,        // DB_DEPTH_CLEAR
+      0x0,        // PA_SC_SCREEN_SCISSOR_TL
       0x40004000, // PA_SC_SCREEN_SCISSOR_BR
-      0x0       , // DB_DFSM_CONTROL
-      0x0       , // DB_RESERVED_REG_2
-      0x0       , // DB_Z_INFO
-      0x0       , // DB_STENCIL_INFO
-      0x0       , // DB_Z_READ_BASE
-      0x0       , // DB_STENCIL_READ_BASE
-      0x0       , // DB_Z_WRITE_BASE
-      0x0       , // DB_STENCIL_WRITE_BASE
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , // DB_Z_READ_BASE_HI
-      0x0       , // DB_STENCIL_READ_BASE_HI
-      0x0       , // DB_Z_WRITE_BASE_HI
-      0x0       , // DB_STENCIL_WRITE_BASE_HI
-      0x0       , // DB_HTILE_DATA_BASE_HI
-      0x0       , // DB_RMI_L2_CACHE_CONTROL
-      0x0       , // TA_BC_BASE_ADDR
+      0x0,        // DB_DFSM_CONTROL
+      0x0,        // DB_RESERVED_REG_2
+      0x0,        // DB_Z_INFO
+      0x0,        // DB_STENCIL_INFO
+      0x0,        // DB_Z_READ_BASE
+      0x0,        // DB_STENCIL_READ_BASE
+      0x0,        // DB_Z_WRITE_BASE
+      0x0,        // DB_STENCIL_WRITE_BASE
+      0x0,        //
+      0x0,        //
+      0x0,        //
+      0x0,        //
+      0x0,        // DB_Z_READ_BASE_HI
+      0x0,        // DB_STENCIL_READ_BASE_HI
+      0x0,        // DB_Z_WRITE_BASE_HI
+      0x0,        // DB_STENCIL_WRITE_BASE_HI
+      0x0,        // DB_HTILE_DATA_BASE_HI
+      0x0,        // DB_RMI_L2_CACHE_CONTROL
+      0x0,        // TA_BC_BASE_ADDR
       0x0         // TA_BC_BASE_ADDR_HI
    };
    static const uint32_t CoherDestBaseHi0Gfx103[] = {
-      0x0       , // COHER_DEST_BASE_HI_0
-      0x0       , // COHER_DEST_BASE_HI_1
-      0x0       , // COHER_DEST_BASE_HI_2
-      0x0       , // COHER_DEST_BASE_HI_3
-      0x0       , // COHER_DEST_BASE_2
-      0x0       , // COHER_DEST_BASE_3
-      0x0       , // PA_SC_WINDOW_OFFSET
+      0x0,        // COHER_DEST_BASE_HI_0
+      0x0,        // COHER_DEST_BASE_HI_1
+      0x0,        // COHER_DEST_BASE_HI_2
+      0x0,        // COHER_DEST_BASE_HI_3
+      0x0,        // COHER_DEST_BASE_2
+      0x0,        // COHER_DEST_BASE_3
+      0x0,        // PA_SC_WINDOW_OFFSET
       0x80000000, // PA_SC_WINDOW_SCISSOR_TL
       0x40004000, // PA_SC_WINDOW_SCISSOR_BR
-      0xffff    , // PA_SC_CLIPRECT_RULE
-      0x0       , // PA_SC_CLIPRECT_0_TL
+      0xffff,     // PA_SC_CLIPRECT_RULE
+      0x0,        // PA_SC_CLIPRECT_0_TL
       0x40004000, // PA_SC_CLIPRECT_0_BR
-      0x0       , // PA_SC_CLIPRECT_1_TL
+      0x0,        // PA_SC_CLIPRECT_1_TL
       0x40004000, // PA_SC_CLIPRECT_1_BR
-      0x0       , // PA_SC_CLIPRECT_2_TL
+      0x0,        // PA_SC_CLIPRECT_2_TL
       0x40004000, // PA_SC_CLIPRECT_2_BR
-      0x0       , // PA_SC_CLIPRECT_3_TL
+      0x0,        // PA_SC_CLIPRECT_3_TL
       0x40004000, // PA_SC_CLIPRECT_3_BR
       0xaa99aaaa, // PA_SC_EDGERULE
-      0x0       , // PA_SU_HARDWARE_SCREEN_OFFSET
+      0x0,        // PA_SU_HARDWARE_SCREEN_OFFSET
       0xffffffff, // CB_TARGET_MASK
       0xffffffff, // CB_SHADER_MASK
       0x80000000, // PA_SC_GENERIC_SCISSOR_TL
       0x40004000, // PA_SC_GENERIC_SCISSOR_BR
-      0x0       , // COHER_DEST_BASE_0
-      0x0       , // COHER_DEST_BASE_1
+      0x0,        // COHER_DEST_BASE_0
+      0x0,        // COHER_DEST_BASE_1
       0x80000000, // PA_SC_VPORT_SCISSOR_0_TL
       0x40004000, // PA_SC_VPORT_SCISSOR_0_BR
       0x80000000, // PA_SC_VPORT_SCISSOR_1_TL
@@ -2316,585 +2332,588 @@ static void gfx103_emulate_clear_state(struct radeon_cmdbuf *cs, unsigned num_re
       0x40004000, // PA_SC_VPORT_SCISSOR_14_BR
       0x80000000, // PA_SC_VPORT_SCISSOR_15_TL
       0x40004000, // PA_SC_VPORT_SCISSOR_15_BR
-      0x0       , // PA_SC_VPORT_ZMIN_0
+      0x0,        // PA_SC_VPORT_ZMIN_0
       0x3f800000, // PA_SC_VPORT_ZMAX_0
-      0x0       , // PA_SC_VPORT_ZMIN_1
+      0x0,        // PA_SC_VPORT_ZMIN_1
       0x3f800000, // PA_SC_VPORT_ZMAX_1
-      0x0       , // PA_SC_VPORT_ZMIN_2
+      0x0,        // PA_SC_VPORT_ZMIN_2
       0x3f800000, // PA_SC_VPORT_ZMAX_2
-      0x0       , // PA_SC_VPORT_ZMIN_3
+      0x0,        // PA_SC_VPORT_ZMIN_3
       0x3f800000, // PA_SC_VPORT_ZMAX_3
-      0x0       , // PA_SC_VPORT_ZMIN_4
+      0x0,        // PA_SC_VPORT_ZMIN_4
       0x3f800000, // PA_SC_VPORT_ZMAX_4
-      0x0       , // PA_SC_VPORT_ZMIN_5
+      0x0,        // PA_SC_VPORT_ZMIN_5
       0x3f800000, // PA_SC_VPORT_ZMAX_5
-      0x0       , // PA_SC_VPORT_ZMIN_6
+      0x0,        // PA_SC_VPORT_ZMIN_6
       0x3f800000, // PA_SC_VPORT_ZMAX_6
-      0x0       , // PA_SC_VPORT_ZMIN_7
+      0x0,        // PA_SC_VPORT_ZMIN_7
       0x3f800000, // PA_SC_VPORT_ZMAX_7
-      0x0       , // PA_SC_VPORT_ZMIN_8
+      0x0,        // PA_SC_VPORT_ZMIN_8
       0x3f800000, // PA_SC_VPORT_ZMAX_8
-      0x0       , // PA_SC_VPORT_ZMIN_9
+      0x0,        // PA_SC_VPORT_ZMIN_9
       0x3f800000, // PA_SC_VPORT_ZMAX_9
-      0x0       , // PA_SC_VPORT_ZMIN_10
+      0x0,        // PA_SC_VPORT_ZMIN_10
       0x3f800000, // PA_SC_VPORT_ZMAX_10
-      0x0       , // PA_SC_VPORT_ZMIN_11
+      0x0,        // PA_SC_VPORT_ZMIN_11
       0x3f800000, // PA_SC_VPORT_ZMAX_11
-      0x0       , // PA_SC_VPORT_ZMIN_12
+      0x0,        // PA_SC_VPORT_ZMIN_12
       0x3f800000, // PA_SC_VPORT_ZMAX_12
-      0x0       , // PA_SC_VPORT_ZMIN_13
+      0x0,        // PA_SC_VPORT_ZMIN_13
       0x3f800000, // PA_SC_VPORT_ZMAX_13
-      0x0       , // PA_SC_VPORT_ZMIN_14
+      0x0,        // PA_SC_VPORT_ZMIN_14
       0x3f800000, // PA_SC_VPORT_ZMAX_14
-      0x0       , // PA_SC_VPORT_ZMIN_15
+      0x0,        // PA_SC_VPORT_ZMIN_15
       0x3f800000, // PA_SC_VPORT_ZMAX_15
-      0x0       , // PA_SC_RASTER_CONFIG
-      0x0       , // PA_SC_RASTER_CONFIG_1
-      0x0       , //
+      0x0,        // PA_SC_RASTER_CONFIG
+      0x0,        // PA_SC_RASTER_CONFIG_1
+      0x0,        //
       0x0         // PA_SC_TILE_STEERING_OVERRIDE
    };
    static const uint32_t VgtMultiPrimIbResetIndxGfx103[] = {
-      0x0       , // VGT_MULTI_PRIM_IB_RESET_INDX
-      0x0       , // CB_RMI_GL2_CACHE_CONTROL
-      0x0       , // CB_BLEND_RED
-      0x0       , // CB_BLEND_GREEN
-      0x0       , // CB_BLEND_BLUE
-      0x0       , // CB_BLEND_ALPHA
-      0x0       , // CB_DCC_CONTROL
-      0x0       , // CB_COVERAGE_OUT_CONTROL
-      0x0       , // DB_STENCIL_CONTROL
-      0x1000000 , // DB_STENCILREFMASK
-      0x1000000 , // DB_STENCILREFMASK_BF
-      0x0       , //
-      0x0       , // PA_CL_VPORT_XSCALE
-      0x0       , // PA_CL_VPORT_XOFFSET
-      0x0       , // PA_CL_VPORT_YSCALE
-      0x0       , // PA_CL_VPORT_YOFFSET
-      0x0       , // PA_CL_VPORT_ZSCALE
-      0x0       , // PA_CL_VPORT_ZOFFSET
-      0x0       , // PA_CL_VPORT_XSCALE_1
-      0x0       , // PA_CL_VPORT_XOFFSET_1
-      0x0       , // PA_CL_VPORT_YSCALE_1
-      0x0       , // PA_CL_VPORT_YOFFSET_1
-      0x0       , // PA_CL_VPORT_ZSCALE_1
-      0x0       , // PA_CL_VPORT_ZOFFSET_1
-      0x0       , // PA_CL_VPORT_XSCALE_2
-      0x0       , // PA_CL_VPORT_XOFFSET_2
-      0x0       , // PA_CL_VPORT_YSCALE_2
-      0x0       , // PA_CL_VPORT_YOFFSET_2
-      0x0       , // PA_CL_VPORT_ZSCALE_2
-      0x0       , // PA_CL_VPORT_ZOFFSET_2
-      0x0       , // PA_CL_VPORT_XSCALE_3
-      0x0       , // PA_CL_VPORT_XOFFSET_3
-      0x0       , // PA_CL_VPORT_YSCALE_3
-      0x0       , // PA_CL_VPORT_YOFFSET_3
-      0x0       , // PA_CL_VPORT_ZSCALE_3
-      0x0       , // PA_CL_VPORT_ZOFFSET_3
-      0x0       , // PA_CL_VPORT_XSCALE_4
-      0x0       , // PA_CL_VPORT_XOFFSET_4
-      0x0       , // PA_CL_VPORT_YSCALE_4
-      0x0       , // PA_CL_VPORT_YOFFSET_4
-      0x0       , // PA_CL_VPORT_ZSCALE_4
-      0x0       , // PA_CL_VPORT_ZOFFSET_4
-      0x0       , // PA_CL_VPORT_XSCALE_5
-      0x0       , // PA_CL_VPORT_XOFFSET_5
-      0x0       , // PA_CL_VPORT_YSCALE_5
-      0x0       , // PA_CL_VPORT_YOFFSET_5
-      0x0       , // PA_CL_VPORT_ZSCALE_5
-      0x0       , // PA_CL_VPORT_ZOFFSET_5
-      0x0       , // PA_CL_VPORT_XSCALE_6
-      0x0       , // PA_CL_VPORT_XOFFSET_6
-      0x0       , // PA_CL_VPORT_YSCALE_6
-      0x0       , // PA_CL_VPORT_YOFFSET_6
-      0x0       , // PA_CL_VPORT_ZSCALE_6
-      0x0       , // PA_CL_VPORT_ZOFFSET_6
-      0x0       , // PA_CL_VPORT_XSCALE_7
-      0x0       , // PA_CL_VPORT_XOFFSET_7
-      0x0       , // PA_CL_VPORT_YSCALE_7
-      0x0       , // PA_CL_VPORT_YOFFSET_7
-      0x0       , // PA_CL_VPORT_ZSCALE_7
-      0x0       , // PA_CL_VPORT_ZOFFSET_7
-      0x0       , // PA_CL_VPORT_XSCALE_8
-      0x0       , // PA_CL_VPORT_XOFFSET_8
-      0x0       , // PA_CL_VPORT_YSCALE_8
-      0x0       , // PA_CL_VPORT_YOFFSET_8
-      0x0       , // PA_CL_VPORT_ZSCALE_8
-      0x0       , // PA_CL_VPORT_ZOFFSET_8
-      0x0       , // PA_CL_VPORT_XSCALE_9
-      0x0       , // PA_CL_VPORT_XOFFSET_9
-      0x0       , // PA_CL_VPORT_YSCALE_9
-      0x0       , // PA_CL_VPORT_YOFFSET_9
-      0x0       , // PA_CL_VPORT_ZSCALE_9
-      0x0       , // PA_CL_VPORT_ZOFFSET_9
-      0x0       , // PA_CL_VPORT_XSCALE_10
-      0x0       , // PA_CL_VPORT_XOFFSET_10
-      0x0       , // PA_CL_VPORT_YSCALE_10
-      0x0       , // PA_CL_VPORT_YOFFSET_10
-      0x0       , // PA_CL_VPORT_ZSCALE_10
-      0x0       , // PA_CL_VPORT_ZOFFSET_10
-      0x0       , // PA_CL_VPORT_XSCALE_11
-      0x0       , // PA_CL_VPORT_XOFFSET_11
-      0x0       , // PA_CL_VPORT_YSCALE_11
-      0x0       , // PA_CL_VPORT_YOFFSET_11
-      0x0       , // PA_CL_VPORT_ZSCALE_11
-      0x0       , // PA_CL_VPORT_ZOFFSET_11
-      0x0       , // PA_CL_VPORT_XSCALE_12
-      0x0       , // PA_CL_VPORT_XOFFSET_12
-      0x0       , // PA_CL_VPORT_YSCALE_12
-      0x0       , // PA_CL_VPORT_YOFFSET_12
-      0x0       , // PA_CL_VPORT_ZSCALE_12
-      0x0       , // PA_CL_VPORT_ZOFFSET_12
-      0x0       , // PA_CL_VPORT_XSCALE_13
-      0x0       , // PA_CL_VPORT_XOFFSET_13
-      0x0       , // PA_CL_VPORT_YSCALE_13
-      0x0       , // PA_CL_VPORT_YOFFSET_13
-      0x0       , // PA_CL_VPORT_ZSCALE_13
-      0x0       , // PA_CL_VPORT_ZOFFSET_13
-      0x0       , // PA_CL_VPORT_XSCALE_14
-      0x0       , // PA_CL_VPORT_XOFFSET_14
-      0x0       , // PA_CL_VPORT_YSCALE_14
-      0x0       , // PA_CL_VPORT_YOFFSET_14
-      0x0       , // PA_CL_VPORT_ZSCALE_14
-      0x0       , // PA_CL_VPORT_ZOFFSET_14
-      0x0       , // PA_CL_VPORT_XSCALE_15
-      0x0       , // PA_CL_VPORT_XOFFSET_15
-      0x0       , // PA_CL_VPORT_YSCALE_15
-      0x0       , // PA_CL_VPORT_YOFFSET_15
-      0x0       , // PA_CL_VPORT_ZSCALE_15
-      0x0       , // PA_CL_VPORT_ZOFFSET_15
-      0x0       , // PA_CL_UCP_0_X
-      0x0       , // PA_CL_UCP_0_Y
-      0x0       , // PA_CL_UCP_0_Z
-      0x0       , // PA_CL_UCP_0_W
-      0x0       , // PA_CL_UCP_1_X
-      0x0       , // PA_CL_UCP_1_Y
-      0x0       , // PA_CL_UCP_1_Z
-      0x0       , // PA_CL_UCP_1_W
-      0x0       , // PA_CL_UCP_2_X
-      0x0       , // PA_CL_UCP_2_Y
-      0x0       , // PA_CL_UCP_2_Z
-      0x0       , // PA_CL_UCP_2_W
-      0x0       , // PA_CL_UCP_3_X
-      0x0       , // PA_CL_UCP_3_Y
-      0x0       , // PA_CL_UCP_3_Z
-      0x0       , // PA_CL_UCP_3_W
-      0x0       , // PA_CL_UCP_4_X
-      0x0       , // PA_CL_UCP_4_Y
-      0x0       , // PA_CL_UCP_4_Z
-      0x0       , // PA_CL_UCP_4_W
-      0x0       , // PA_CL_UCP_5_X
-      0x0       , // PA_CL_UCP_5_Y
-      0x0       , // PA_CL_UCP_5_Z
-      0x0         // PA_CL_UCP_5_W
+      0x0,       // VGT_MULTI_PRIM_IB_RESET_INDX
+      0x0,       // CB_RMI_GL2_CACHE_CONTROL
+      0x0,       // CB_BLEND_RED
+      0x0,       // CB_BLEND_GREEN
+      0x0,       // CB_BLEND_BLUE
+      0x0,       // CB_BLEND_ALPHA
+      0x0,       // CB_DCC_CONTROL
+      0x0,       // CB_COVERAGE_OUT_CONTROL
+      0x0,       // DB_STENCIL_CONTROL
+      0x1000000, // DB_STENCILREFMASK
+      0x1000000, // DB_STENCILREFMASK_BF
+      0x0,       //
+      0x0,       // PA_CL_VPORT_XSCALE
+      0x0,       // PA_CL_VPORT_XOFFSET
+      0x0,       // PA_CL_VPORT_YSCALE
+      0x0,       // PA_CL_VPORT_YOFFSET
+      0x0,       // PA_CL_VPORT_ZSCALE
+      0x0,       // PA_CL_VPORT_ZOFFSET
+      0x0,       // PA_CL_VPORT_XSCALE_1
+      0x0,       // PA_CL_VPORT_XOFFSET_1
+      0x0,       // PA_CL_VPORT_YSCALE_1
+      0x0,       // PA_CL_VPORT_YOFFSET_1
+      0x0,       // PA_CL_VPORT_ZSCALE_1
+      0x0,       // PA_CL_VPORT_ZOFFSET_1
+      0x0,       // PA_CL_VPORT_XSCALE_2
+      0x0,       // PA_CL_VPORT_XOFFSET_2
+      0x0,       // PA_CL_VPORT_YSCALE_2
+      0x0,       // PA_CL_VPORT_YOFFSET_2
+      0x0,       // PA_CL_VPORT_ZSCALE_2
+      0x0,       // PA_CL_VPORT_ZOFFSET_2
+      0x0,       // PA_CL_VPORT_XSCALE_3
+      0x0,       // PA_CL_VPORT_XOFFSET_3
+      0x0,       // PA_CL_VPORT_YSCALE_3
+      0x0,       // PA_CL_VPORT_YOFFSET_3
+      0x0,       // PA_CL_VPORT_ZSCALE_3
+      0x0,       // PA_CL_VPORT_ZOFFSET_3
+      0x0,       // PA_CL_VPORT_XSCALE_4
+      0x0,       // PA_CL_VPORT_XOFFSET_4
+      0x0,       // PA_CL_VPORT_YSCALE_4
+      0x0,       // PA_CL_VPORT_YOFFSET_4
+      0x0,       // PA_CL_VPORT_ZSCALE_4
+      0x0,       // PA_CL_VPORT_ZOFFSET_4
+      0x0,       // PA_CL_VPORT_XSCALE_5
+      0x0,       // PA_CL_VPORT_XOFFSET_5
+      0x0,       // PA_CL_VPORT_YSCALE_5
+      0x0,       // PA_CL_VPORT_YOFFSET_5
+      0x0,       // PA_CL_VPORT_ZSCALE_5
+      0x0,       // PA_CL_VPORT_ZOFFSET_5
+      0x0,       // PA_CL_VPORT_XSCALE_6
+      0x0,       // PA_CL_VPORT_XOFFSET_6
+      0x0,       // PA_CL_VPORT_YSCALE_6
+      0x0,       // PA_CL_VPORT_YOFFSET_6
+      0x0,       // PA_CL_VPORT_ZSCALE_6
+      0x0,       // PA_CL_VPORT_ZOFFSET_6
+      0x0,       // PA_CL_VPORT_XSCALE_7
+      0x0,       // PA_CL_VPORT_XOFFSET_7
+      0x0,       // PA_CL_VPORT_YSCALE_7
+      0x0,       // PA_CL_VPORT_YOFFSET_7
+      0x0,       // PA_CL_VPORT_ZSCALE_7
+      0x0,       // PA_CL_VPORT_ZOFFSET_7
+      0x0,       // PA_CL_VPORT_XSCALE_8
+      0x0,       // PA_CL_VPORT_XOFFSET_8
+      0x0,       // PA_CL_VPORT_YSCALE_8
+      0x0,       // PA_CL_VPORT_YOFFSET_8
+      0x0,       // PA_CL_VPORT_ZSCALE_8
+      0x0,       // PA_CL_VPORT_ZOFFSET_8
+      0x0,       // PA_CL_VPORT_XSCALE_9
+      0x0,       // PA_CL_VPORT_XOFFSET_9
+      0x0,       // PA_CL_VPORT_YSCALE_9
+      0x0,       // PA_CL_VPORT_YOFFSET_9
+      0x0,       // PA_CL_VPORT_ZSCALE_9
+      0x0,       // PA_CL_VPORT_ZOFFSET_9
+      0x0,       // PA_CL_VPORT_XSCALE_10
+      0x0,       // PA_CL_VPORT_XOFFSET_10
+      0x0,       // PA_CL_VPORT_YSCALE_10
+      0x0,       // PA_CL_VPORT_YOFFSET_10
+      0x0,       // PA_CL_VPORT_ZSCALE_10
+      0x0,       // PA_CL_VPORT_ZOFFSET_10
+      0x0,       // PA_CL_VPORT_XSCALE_11
+      0x0,       // PA_CL_VPORT_XOFFSET_11
+      0x0,       // PA_CL_VPORT_YSCALE_11
+      0x0,       // PA_CL_VPORT_YOFFSET_11
+      0x0,       // PA_CL_VPORT_ZSCALE_11
+      0x0,       // PA_CL_VPORT_ZOFFSET_11
+      0x0,       // PA_CL_VPORT_XSCALE_12
+      0x0,       // PA_CL_VPORT_XOFFSET_12
+      0x0,       // PA_CL_VPORT_YSCALE_12
+      0x0,       // PA_CL_VPORT_YOFFSET_12
+      0x0,       // PA_CL_VPORT_ZSCALE_12
+      0x0,       // PA_CL_VPORT_ZOFFSET_12
+      0x0,       // PA_CL_VPORT_XSCALE_13
+      0x0,       // PA_CL_VPORT_XOFFSET_13
+      0x0,       // PA_CL_VPORT_YSCALE_13
+      0x0,       // PA_CL_VPORT_YOFFSET_13
+      0x0,       // PA_CL_VPORT_ZSCALE_13
+      0x0,       // PA_CL_VPORT_ZOFFSET_13
+      0x0,       // PA_CL_VPORT_XSCALE_14
+      0x0,       // PA_CL_VPORT_XOFFSET_14
+      0x0,       // PA_CL_VPORT_YSCALE_14
+      0x0,       // PA_CL_VPORT_YOFFSET_14
+      0x0,       // PA_CL_VPORT_ZSCALE_14
+      0x0,       // PA_CL_VPORT_ZOFFSET_14
+      0x0,       // PA_CL_VPORT_XSCALE_15
+      0x0,       // PA_CL_VPORT_XOFFSET_15
+      0x0,       // PA_CL_VPORT_YSCALE_15
+      0x0,       // PA_CL_VPORT_YOFFSET_15
+      0x0,       // PA_CL_VPORT_ZSCALE_15
+      0x0,       // PA_CL_VPORT_ZOFFSET_15
+      0x0,       // PA_CL_UCP_0_X
+      0x0,       // PA_CL_UCP_0_Y
+      0x0,       // PA_CL_UCP_0_Z
+      0x0,       // PA_CL_UCP_0_W
+      0x0,       // PA_CL_UCP_1_X
+      0x0,       // PA_CL_UCP_1_Y
+      0x0,       // PA_CL_UCP_1_Z
+      0x0,       // PA_CL_UCP_1_W
+      0x0,       // PA_CL_UCP_2_X
+      0x0,       // PA_CL_UCP_2_Y
+      0x0,       // PA_CL_UCP_2_Z
+      0x0,       // PA_CL_UCP_2_W
+      0x0,       // PA_CL_UCP_3_X
+      0x0,       // PA_CL_UCP_3_Y
+      0x0,       // PA_CL_UCP_3_Z
+      0x0,       // PA_CL_UCP_3_W
+      0x0,       // PA_CL_UCP_4_X
+      0x0,       // PA_CL_UCP_4_Y
+      0x0,       // PA_CL_UCP_4_Z
+      0x0,       // PA_CL_UCP_4_W
+      0x0,       // PA_CL_UCP_5_X
+      0x0,       // PA_CL_UCP_5_Y
+      0x0,       // PA_CL_UCP_5_Z
+      0x0        // PA_CL_UCP_5_W
    };
    static const uint32_t SpiPsInputCntl0Gfx103[] = {
-      0x0       , // SPI_PS_INPUT_CNTL_0
-      0x0       , // SPI_PS_INPUT_CNTL_1
-      0x0       , // SPI_PS_INPUT_CNTL_2
-      0x0       , // SPI_PS_INPUT_CNTL_3
-      0x0       , // SPI_PS_INPUT_CNTL_4
-      0x0       , // SPI_PS_INPUT_CNTL_5
-      0x0       , // SPI_PS_INPUT_CNTL_6
-      0x0       , // SPI_PS_INPUT_CNTL_7
-      0x0       , // SPI_PS_INPUT_CNTL_8
-      0x0       , // SPI_PS_INPUT_CNTL_9
-      0x0       , // SPI_PS_INPUT_CNTL_10
-      0x0       , // SPI_PS_INPUT_CNTL_11
-      0x0       , // SPI_PS_INPUT_CNTL_12
-      0x0       , // SPI_PS_INPUT_CNTL_13
-      0x0       , // SPI_PS_INPUT_CNTL_14
-      0x0       , // SPI_PS_INPUT_CNTL_15
-      0x0       , // SPI_PS_INPUT_CNTL_16
-      0x0       , // SPI_PS_INPUT_CNTL_17
-      0x0       , // SPI_PS_INPUT_CNTL_18
-      0x0       , // SPI_PS_INPUT_CNTL_19
-      0x0       , // SPI_PS_INPUT_CNTL_20
-      0x0       , // SPI_PS_INPUT_CNTL_21
-      0x0       , // SPI_PS_INPUT_CNTL_22
-      0x0       , // SPI_PS_INPUT_CNTL_23
-      0x0       , // SPI_PS_INPUT_CNTL_24
-      0x0       , // SPI_PS_INPUT_CNTL_25
-      0x0       , // SPI_PS_INPUT_CNTL_26
-      0x0       , // SPI_PS_INPUT_CNTL_27
-      0x0       , // SPI_PS_INPUT_CNTL_28
-      0x0       , // SPI_PS_INPUT_CNTL_29
-      0x0       , // SPI_PS_INPUT_CNTL_30
-      0x0       , // SPI_PS_INPUT_CNTL_31
-      0x0       , // SPI_VS_OUT_CONFIG
-      0x0       , //
-      0x0       , // SPI_PS_INPUT_ENA
-      0x0       , // SPI_PS_INPUT_ADDR
-      0x0       , // SPI_INTERP_CONTROL_0
-      0x2       , // SPI_PS_IN_CONTROL
-      0x0       , //
-      0x0       , // SPI_BARYC_CNTL
-      0x0       , //
-      0x0       , // SPI_TMPRING_SIZE
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , // SPI_SHADER_IDX_FORMAT
-      0x0       , // SPI_SHADER_POS_FORMAT
-      0x0       , // SPI_SHADER_Z_FORMAT
-      0x0         // SPI_SHADER_COL_FORMAT
+      0x0, // SPI_PS_INPUT_CNTL_0
+      0x0, // SPI_PS_INPUT_CNTL_1
+      0x0, // SPI_PS_INPUT_CNTL_2
+      0x0, // SPI_PS_INPUT_CNTL_3
+      0x0, // SPI_PS_INPUT_CNTL_4
+      0x0, // SPI_PS_INPUT_CNTL_5
+      0x0, // SPI_PS_INPUT_CNTL_6
+      0x0, // SPI_PS_INPUT_CNTL_7
+      0x0, // SPI_PS_INPUT_CNTL_8
+      0x0, // SPI_PS_INPUT_CNTL_9
+      0x0, // SPI_PS_INPUT_CNTL_10
+      0x0, // SPI_PS_INPUT_CNTL_11
+      0x0, // SPI_PS_INPUT_CNTL_12
+      0x0, // SPI_PS_INPUT_CNTL_13
+      0x0, // SPI_PS_INPUT_CNTL_14
+      0x0, // SPI_PS_INPUT_CNTL_15
+      0x0, // SPI_PS_INPUT_CNTL_16
+      0x0, // SPI_PS_INPUT_CNTL_17
+      0x0, // SPI_PS_INPUT_CNTL_18
+      0x0, // SPI_PS_INPUT_CNTL_19
+      0x0, // SPI_PS_INPUT_CNTL_20
+      0x0, // SPI_PS_INPUT_CNTL_21
+      0x0, // SPI_PS_INPUT_CNTL_22
+      0x0, // SPI_PS_INPUT_CNTL_23
+      0x0, // SPI_PS_INPUT_CNTL_24
+      0x0, // SPI_PS_INPUT_CNTL_25
+      0x0, // SPI_PS_INPUT_CNTL_26
+      0x0, // SPI_PS_INPUT_CNTL_27
+      0x0, // SPI_PS_INPUT_CNTL_28
+      0x0, // SPI_PS_INPUT_CNTL_29
+      0x0, // SPI_PS_INPUT_CNTL_30
+      0x0, // SPI_PS_INPUT_CNTL_31
+      0x0, // SPI_VS_OUT_CONFIG
+      0x0, //
+      0x0, // SPI_PS_INPUT_ENA
+      0x0, // SPI_PS_INPUT_ADDR
+      0x0, // SPI_INTERP_CONTROL_0
+      0x2, // SPI_PS_IN_CONTROL
+      0x0, //
+      0x0, // SPI_BARYC_CNTL
+      0x0, //
+      0x0, // SPI_TMPRING_SIZE
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, // SPI_SHADER_IDX_FORMAT
+      0x0, // SPI_SHADER_POS_FORMAT
+      0x0, // SPI_SHADER_Z_FORMAT
+      0x0  // SPI_SHADER_COL_FORMAT
    };
    static const uint32_t SxPsDownconvertControlGfx103[] = {
-      0x0       , // SX_PS_DOWNCONVERT_CONTROL
-      0x0       , // SX_PS_DOWNCONVERT
-      0x0       , // SX_BLEND_OPT_EPSILON
-      0x0       , // SX_BLEND_OPT_CONTROL
-      0x0       , // SX_MRT0_BLEND_OPT
-      0x0       , // SX_MRT1_BLEND_OPT
-      0x0       , // SX_MRT2_BLEND_OPT
-      0x0       , // SX_MRT3_BLEND_OPT
-      0x0       , // SX_MRT4_BLEND_OPT
-      0x0       , // SX_MRT5_BLEND_OPT
-      0x0       , // SX_MRT6_BLEND_OPT
-      0x0       , // SX_MRT7_BLEND_OPT
-      0x0       , // CB_BLEND0_CONTROL
-      0x0       , // CB_BLEND1_CONTROL
-      0x0       , // CB_BLEND2_CONTROL
-      0x0       , // CB_BLEND3_CONTROL
-      0x0       , // CB_BLEND4_CONTROL
-      0x0       , // CB_BLEND5_CONTROL
-      0x0       , // CB_BLEND6_CONTROL
-      0x0         // CB_BLEND7_CONTROL
+      0x0, // SX_PS_DOWNCONVERT_CONTROL
+      0x0, // SX_PS_DOWNCONVERT
+      0x0, // SX_BLEND_OPT_EPSILON
+      0x0, // SX_BLEND_OPT_CONTROL
+      0x0, // SX_MRT0_BLEND_OPT
+      0x0, // SX_MRT1_BLEND_OPT
+      0x0, // SX_MRT2_BLEND_OPT
+      0x0, // SX_MRT3_BLEND_OPT
+      0x0, // SX_MRT4_BLEND_OPT
+      0x0, // SX_MRT5_BLEND_OPT
+      0x0, // SX_MRT6_BLEND_OPT
+      0x0, // SX_MRT7_BLEND_OPT
+      0x0, // CB_BLEND0_CONTROL
+      0x0, // CB_BLEND1_CONTROL
+      0x0, // CB_BLEND2_CONTROL
+      0x0, // CB_BLEND3_CONTROL
+      0x0, // CB_BLEND4_CONTROL
+      0x0, // CB_BLEND5_CONTROL
+      0x0, // CB_BLEND6_CONTROL
+      0x0  // CB_BLEND7_CONTROL
    };
    static const uint32_t GeMaxOutputPerSubgroupGfx103[] = {
-      0x0       , // GE_MAX_OUTPUT_PER_SUBGROUP
-      0x0       , // DB_DEPTH_CONTROL
-      0x0       , // DB_EQAA
-      0x0       , // CB_COLOR_CONTROL
-      0x0       , // DB_SHADER_CONTROL
-      0x90000   , // PA_CL_CLIP_CNTL
-      0x4       , // PA_SU_SC_MODE_CNTL
-      0x0       , // PA_CL_VTE_CNTL
-      0x0       , // PA_CL_VS_OUT_CNTL
-      0x0         // PA_CL_NANINF_CNTL
+      0x0,     // GE_MAX_OUTPUT_PER_SUBGROUP
+      0x0,     // DB_DEPTH_CONTROL
+      0x0,     // DB_EQAA
+      0x0,     // CB_COLOR_CONTROL
+      0x0,     // DB_SHADER_CONTROL
+      0x90000, // PA_CL_CLIP_CNTL
+      0x4,     // PA_SU_SC_MODE_CNTL
+      0x0,     // PA_CL_VTE_CNTL
+      0x0,     // PA_CL_VS_OUT_CNTL
+      0x0      // PA_CL_NANINF_CNTL
    };
    static const uint32_t PaSuPrimFilterCntlGfx103[] = {
-      0x0       , // PA_SU_PRIM_FILTER_CNTL
-      0x0       , // PA_SU_SMALL_PRIM_FILTER_CNTL
-      0x0       , //
-      0x0       , // PA_CL_NGG_CNTL
-      0x0       , // PA_SU_OVER_RASTERIZATION_CNTL
-      0x0       , // PA_STEREO_CNTL
-      0x0       , // PA_STATE_STEREO_X
-      0x0         //
+      0x0, // PA_SU_PRIM_FILTER_CNTL
+      0x0, // PA_SU_SMALL_PRIM_FILTER_CNTL
+      0x0, //
+      0x0, // PA_CL_NGG_CNTL
+      0x0, // PA_SU_OVER_RASTERIZATION_CNTL
+      0x0, // PA_STEREO_CNTL
+      0x0, // PA_STATE_STEREO_X
+      0x0  //
    };
    static const uint32_t PaSuPointSizeGfx103[] = {
-      0x0       , // PA_SU_POINT_SIZE
-      0x0       , // PA_SU_POINT_MINMAX
-      0x0       , // PA_SU_LINE_CNTL
-      0x0         // PA_SC_LINE_STIPPLE
+      0x0, // PA_SU_POINT_SIZE
+      0x0, // PA_SU_POINT_MINMAX
+      0x0, // PA_SU_LINE_CNTL
+      0x0  // PA_SC_LINE_STIPPLE
    };
    static const uint32_t VgtHosMaxTessLevelGfx103[] = {
-      0x0       , // VGT_HOS_MAX_TESS_LEVEL
-      0x0         // VGT_HOS_MIN_TESS_LEVEL
+      0x0, // VGT_HOS_MAX_TESS_LEVEL
+      0x0  // VGT_HOS_MIN_TESS_LEVEL
    };
    static const uint32_t VgtGsModeGfx103[] = {
-      0x0       , // VGT_GS_MODE
-      0x0       , // VGT_GS_ONCHIP_CNTL
-      0x0       , // PA_SC_MODE_CNTL_0
-      0x0       , // PA_SC_MODE_CNTL_1
-      0x0       , // VGT_ENHANCE
-      0x100     , // VGT_GS_PER_ES
-      0x80      , // VGT_ES_PER_GS
-      0x2       , // VGT_GS_PER_VS
-      0x0       , // VGT_GSVS_RING_OFFSET_1
-      0x0       , // VGT_GSVS_RING_OFFSET_2
-      0x0       , // VGT_GSVS_RING_OFFSET_3
-      0x0         // VGT_GS_OUT_PRIM_TYPE
+      0x0,   // VGT_GS_MODE
+      0x0,   // VGT_GS_ONCHIP_CNTL
+      0x0,   // PA_SC_MODE_CNTL_0
+      0x0,   // PA_SC_MODE_CNTL_1
+      0x0,   // VGT_ENHANCE
+      0x100, // VGT_GS_PER_ES
+      0x80,  // VGT_ES_PER_GS
+      0x2,   // VGT_GS_PER_VS
+      0x0,   // VGT_GSVS_RING_OFFSET_1
+      0x0,   // VGT_GSVS_RING_OFFSET_2
+      0x0,   // VGT_GSVS_RING_OFFSET_3
+      0x0    // VGT_GS_OUT_PRIM_TYPE
    };
    static const uint32_t VgtPrimitiveidEnGfx103[] = {
-      0x0         // VGT_PRIMITIVEID_EN
+      0x0 // VGT_PRIMITIVEID_EN
    };
    static const uint32_t VgtPrimitiveidResetGfx103[] = {
-      0x0         // VGT_PRIMITIVEID_RESET
+      0x0 // VGT_PRIMITIVEID_RESET
    };
    static const uint32_t VgtDrawPayloadCntlGfx103[] = {
-      0x0       , // VGT_DRAW_PAYLOAD_CNTL
-      0x0       , //
-      0x0       , // VGT_INSTANCE_STEP_RATE_0
-      0x0       , // VGT_INSTANCE_STEP_RATE_1
-      0x0       , // IA_MULTI_VGT_PARAM
-      0x0       , // VGT_ESGS_RING_ITEMSIZE
-      0x0       , // VGT_GSVS_RING_ITEMSIZE
-      0x0       , // VGT_REUSE_OFF
-      0x0       , // VGT_VTX_CNT_EN
-      0x0       , // DB_HTILE_SURFACE
-      0x0       , // DB_SRESULTS_COMPARE_STATE0
-      0x0       , // DB_SRESULTS_COMPARE_STATE1
-      0x0       , // DB_PRELOAD_CONTROL
-      0x0       , //
-      0x0       , // VGT_STRMOUT_BUFFER_SIZE_0
-      0x0       , // VGT_STRMOUT_VTX_STRIDE_0
-      0x0       , //
-      0x0       , // VGT_STRMOUT_BUFFER_OFFSET_0
-      0x0       , // VGT_STRMOUT_BUFFER_SIZE_1
-      0x0       , // VGT_STRMOUT_VTX_STRIDE_1
-      0x0       , //
-      0x0       , // VGT_STRMOUT_BUFFER_OFFSET_1
-      0x0       , // VGT_STRMOUT_BUFFER_SIZE_2
-      0x0       , // VGT_STRMOUT_VTX_STRIDE_2
-      0x0       , //
-      0x0       , // VGT_STRMOUT_BUFFER_OFFSET_2
-      0x0       , // VGT_STRMOUT_BUFFER_SIZE_3
-      0x0       , // VGT_STRMOUT_VTX_STRIDE_3
-      0x0       , //
-      0x0       , // VGT_STRMOUT_BUFFER_OFFSET_3
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , // VGT_STRMOUT_DRAW_OPAQUE_OFFSET
-      0x0       , // VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE
-      0x0       , // VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE
-      0x0       , //
-      0x0       , // VGT_GS_MAX_VERT_OUT
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , //
-      0x0       , // GE_NGG_SUBGRP_CNTL
-      0x0       , // VGT_TESS_DISTRIBUTION
-      0x0       , // VGT_SHADER_STAGES_EN
-      0x0       , // VGT_LS_HS_CONFIG
-      0x0       , // VGT_GS_VERT_ITEMSIZE
-      0x0       , // VGT_GS_VERT_ITEMSIZE_1
-      0x0       , // VGT_GS_VERT_ITEMSIZE_2
-      0x0       , // VGT_GS_VERT_ITEMSIZE_3
-      0x0       , // VGT_TF_PARAM
-      0x0       , // DB_ALPHA_TO_MASK
-      0x0       , //
-      0x0       , // PA_SU_POLY_OFFSET_DB_FMT_CNTL
-      0x0       , // PA_SU_POLY_OFFSET_CLAMP
-      0x0       , // PA_SU_POLY_OFFSET_FRONT_SCALE
-      0x0       , // PA_SU_POLY_OFFSET_FRONT_OFFSET
-      0x0       , // PA_SU_POLY_OFFSET_BACK_SCALE
-      0x0       , // PA_SU_POLY_OFFSET_BACK_OFFSET
-      0x0       , // VGT_GS_INSTANCE_CNT
-      0x0       , // VGT_STRMOUT_CONFIG
-      0x0         // VGT_STRMOUT_BUFFER_CONFIG
+      0x0, // VGT_DRAW_PAYLOAD_CNTL
+      0x0, //
+      0x0, // VGT_INSTANCE_STEP_RATE_0
+      0x0, // VGT_INSTANCE_STEP_RATE_1
+      0x0, // IA_MULTI_VGT_PARAM
+      0x0, // VGT_ESGS_RING_ITEMSIZE
+      0x0, // VGT_GSVS_RING_ITEMSIZE
+      0x0, // VGT_REUSE_OFF
+      0x0, // VGT_VTX_CNT_EN
+      0x0, // DB_HTILE_SURFACE
+      0x0, // DB_SRESULTS_COMPARE_STATE0
+      0x0, // DB_SRESULTS_COMPARE_STATE1
+      0x0, // DB_PRELOAD_CONTROL
+      0x0, //
+      0x0, // VGT_STRMOUT_BUFFER_SIZE_0
+      0x0, // VGT_STRMOUT_VTX_STRIDE_0
+      0x0, //
+      0x0, // VGT_STRMOUT_BUFFER_OFFSET_0
+      0x0, // VGT_STRMOUT_BUFFER_SIZE_1
+      0x0, // VGT_STRMOUT_VTX_STRIDE_1
+      0x0, //
+      0x0, // VGT_STRMOUT_BUFFER_OFFSET_1
+      0x0, // VGT_STRMOUT_BUFFER_SIZE_2
+      0x0, // VGT_STRMOUT_VTX_STRIDE_2
+      0x0, //
+      0x0, // VGT_STRMOUT_BUFFER_OFFSET_2
+      0x0, // VGT_STRMOUT_BUFFER_SIZE_3
+      0x0, // VGT_STRMOUT_VTX_STRIDE_3
+      0x0, //
+      0x0, // VGT_STRMOUT_BUFFER_OFFSET_3
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, // VGT_STRMOUT_DRAW_OPAQUE_OFFSET
+      0x0, // VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE
+      0x0, // VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE
+      0x0, //
+      0x0, // VGT_GS_MAX_VERT_OUT
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, //
+      0x0, // GE_NGG_SUBGRP_CNTL
+      0x0, // VGT_TESS_DISTRIBUTION
+      0x0, // VGT_SHADER_STAGES_EN
+      0x0, // VGT_LS_HS_CONFIG
+      0x0, // VGT_GS_VERT_ITEMSIZE
+      0x0, // VGT_GS_VERT_ITEMSIZE_1
+      0x0, // VGT_GS_VERT_ITEMSIZE_2
+      0x0, // VGT_GS_VERT_ITEMSIZE_3
+      0x0, // VGT_TF_PARAM
+      0x0, // DB_ALPHA_TO_MASK
+      0x0, //
+      0x0, // PA_SU_POLY_OFFSET_DB_FMT_CNTL
+      0x0, // PA_SU_POLY_OFFSET_CLAMP
+      0x0, // PA_SU_POLY_OFFSET_FRONT_SCALE
+      0x0, // PA_SU_POLY_OFFSET_FRONT_OFFSET
+      0x0, // PA_SU_POLY_OFFSET_BACK_SCALE
+      0x0, // PA_SU_POLY_OFFSET_BACK_OFFSET
+      0x0, // VGT_GS_INSTANCE_CNT
+      0x0, // VGT_STRMOUT_CONFIG
+      0x0  // VGT_STRMOUT_BUFFER_CONFIG
    };
    static const uint32_t PaScCentroidPriority0Gfx103[] = {
-      0x0       , // PA_SC_CENTROID_PRIORITY_0
-      0x0       , // PA_SC_CENTROID_PRIORITY_1
-      0x1000    , // PA_SC_LINE_CNTL
-      0x0       , // PA_SC_AA_CONFIG
-      0x5       , // PA_SU_VTX_CNTL
+      0x0,        // PA_SC_CENTROID_PRIORITY_0
+      0x0,        // PA_SC_CENTROID_PRIORITY_1
+      0x1000,     // PA_SC_LINE_CNTL
+      0x0,        // PA_SC_AA_CONFIG
+      0x5,        // PA_SU_VTX_CNTL
       0x3f800000, // PA_CL_GB_VERT_CLIP_ADJ
       0x3f800000, // PA_CL_GB_VERT_DISC_ADJ
       0x3f800000, // PA_CL_GB_HORZ_CLIP_ADJ
       0x3f800000, // PA_CL_GB_HORZ_DISC_ADJ
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_2
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_3
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_2
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_3
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_2
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_3
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_2
-      0x0       , // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_3
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_2
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_3
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_2
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_3
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_2
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_3
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_2
+      0x0,        // PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_3
       0xffffffff, // PA_SC_AA_MASK_X0Y0_X1Y0
       0xffffffff, // PA_SC_AA_MASK_X0Y1_X1Y1
-      0x0       , // PA_SC_SHADER_CONTROL
-      0x3       , // PA_SC_BINNER_CNTL_0
-      0x0       , // PA_SC_BINNER_CNTL_1
-      0x100000  , // PA_SC_CONSERVATIVE_RASTERIZATION_CNTL
-      0x0       , // PA_SC_NGG_MODE_CNTL
-      0x0       , //
-      0x1e      , // VGT_VERTEX_REUSE_BLOCK_CNTL
-      0x20      , // VGT_OUT_DEALLOC_CNTL
-      0x0       , // CB_COLOR0_BASE
-      0x0       , //
-      0x0       , //
-      0x0       , // CB_COLOR0_VIEW
-      0x0       , // CB_COLOR0_INFO
-      0x0       , // CB_COLOR0_ATTRIB
-      0x0       , // CB_COLOR0_DCC_CONTROL
-      0x0       , // CB_COLOR0_CMASK
-      0x0       , //
-      0x0       , // CB_COLOR0_FMASK
-      0x0       , //
-      0x0       , // CB_COLOR0_CLEAR_WORD0
-      0x0       , // CB_COLOR0_CLEAR_WORD1
-      0x0       , // CB_COLOR0_DCC_BASE
-      0x0       , //
-      0x0       , // CB_COLOR1_BASE
-      0x0       , //
-      0x0       , //
-      0x0       , // CB_COLOR1_VIEW
-      0x0       , // CB_COLOR1_INFO
-      0x0       , // CB_COLOR1_ATTRIB
-      0x0       , // CB_COLOR1_DCC_CONTROL
-      0x0       , // CB_COLOR1_CMASK
-      0x0       , //
-      0x0       , // CB_COLOR1_FMASK
-      0x0       , //
-      0x0       , // CB_COLOR1_CLEAR_WORD0
-      0x0       , // CB_COLOR1_CLEAR_WORD1
-      0x0       , // CB_COLOR1_DCC_BASE
-      0x0       , //
-      0x0       , // CB_COLOR2_BASE
-      0x0       , //
-      0x0       , //
-      0x0       , // CB_COLOR2_VIEW
-      0x0       , // CB_COLOR2_INFO
-      0x0       , // CB_COLOR2_ATTRIB
-      0x0       , // CB_COLOR2_DCC_CONTROL
-      0x0       , // CB_COLOR2_CMASK
-      0x0       , //
-      0x0       , // CB_COLOR2_FMASK
-      0x0       , //
-      0x0       , // CB_COLOR2_CLEAR_WORD0
-      0x0       , // CB_COLOR2_CLEAR_WORD1
-      0x0       , // CB_COLOR2_DCC_BASE
-      0x0       , //
-      0x0       , // CB_COLOR3_BASE
-      0x0       , //
-      0x0       , //
-      0x0       , // CB_COLOR3_VIEW
-      0x0       , // CB_COLOR3_INFO
-      0x0       , // CB_COLOR3_ATTRIB
-      0x0       , // CB_COLOR3_DCC_CONTROL
-      0x0       , // CB_COLOR3_CMASK
-      0x0       , //
-      0x0       , // CB_COLOR3_FMASK
-      0x0       , //
-      0x0       , // CB_COLOR3_CLEAR_WORD0
-      0x0       , // CB_COLOR3_CLEAR_WORD1
-      0x0       , // CB_COLOR3_DCC_BASE
-      0x0       , //
-      0x0       , // CB_COLOR4_BASE
-      0x0       , //
-      0x0       , //
-      0x0       , // CB_COLOR4_VIEW
-      0x0       , // CB_COLOR4_INFO
-      0x0       , // CB_COLOR4_ATTRIB
-      0x0       , // CB_COLOR4_DCC_CONTROL
-      0x0       , // CB_COLOR4_CMASK
-      0x0       , //
-      0x0       , // CB_COLOR4_FMASK
-      0x0       , //
-      0x0       , // CB_COLOR4_CLEAR_WORD0
-      0x0       , // CB_COLOR4_CLEAR_WORD1
-      0x0       , // CB_COLOR4_DCC_BASE
-      0x0       , //
-      0x0       , // CB_COLOR5_BASE
-      0x0       , //
-      0x0       , //
-      0x0       , // CB_COLOR5_VIEW
-      0x0       , // CB_COLOR5_INFO
-      0x0       , // CB_COLOR5_ATTRIB
-      0x0       , // CB_COLOR5_DCC_CONTROL
-      0x0       , // CB_COLOR5_CMASK
-      0x0       , //
-      0x0       , // CB_COLOR5_FMASK
-      0x0       , //
-      0x0       , // CB_COLOR5_CLEAR_WORD0
-      0x0       , // CB_COLOR5_CLEAR_WORD1
-      0x0       , // CB_COLOR5_DCC_BASE
-      0x0       , //
-      0x0       , // CB_COLOR6_BASE
-      0x0       , //
-      0x0       , //
-      0x0       , // CB_COLOR6_VIEW
-      0x0       , // CB_COLOR6_INFO
-      0x0       , // CB_COLOR6_ATTRIB
-      0x0       , // CB_COLOR6_DCC_CONTROL
-      0x0       , // CB_COLOR6_CMASK
-      0x0       , //
-      0x0       , // CB_COLOR6_FMASK
-      0x0       , //
-      0x0       , // CB_COLOR6_CLEAR_WORD0
-      0x0       , // CB_COLOR6_CLEAR_WORD1
-      0x0       , // CB_COLOR6_DCC_BASE
-      0x0       , //
-      0x0       , // CB_COLOR7_BASE
-      0x0       , //
-      0x0       , //
-      0x0       , // CB_COLOR7_VIEW
-      0x0       , // CB_COLOR7_INFO
-      0x0       , // CB_COLOR7_ATTRIB
-      0x0       , // CB_COLOR7_DCC_CONTROL
-      0x0       , // CB_COLOR7_CMASK
-      0x0       , //
-      0x0       , // CB_COLOR7_FMASK
-      0x0       , //
-      0x0       , // CB_COLOR7_CLEAR_WORD0
-      0x0       , // CB_COLOR7_CLEAR_WORD1
-      0x0       , // CB_COLOR7_DCC_BASE
-      0x0       , //
-      0x0       , // CB_COLOR0_BASE_EXT
-      0x0       , // CB_COLOR1_BASE_EXT
-      0x0       , // CB_COLOR2_BASE_EXT
-      0x0       , // CB_COLOR3_BASE_EXT
-      0x0       , // CB_COLOR4_BASE_EXT
-      0x0       , // CB_COLOR5_BASE_EXT
-      0x0       , // CB_COLOR6_BASE_EXT
-      0x0       , // CB_COLOR7_BASE_EXT
-      0x0       , // CB_COLOR0_CMASK_BASE_EXT
-      0x0       , // CB_COLOR1_CMASK_BASE_EXT
-      0x0       , // CB_COLOR2_CMASK_BASE_EXT
-      0x0       , // CB_COLOR3_CMASK_BASE_EXT
-      0x0       , // CB_COLOR4_CMASK_BASE_EXT
-      0x0       , // CB_COLOR5_CMASK_BASE_EXT
-      0x0       , // CB_COLOR6_CMASK_BASE_EXT
-      0x0       , // CB_COLOR7_CMASK_BASE_EXT
-      0x0       , // CB_COLOR0_FMASK_BASE_EXT
-      0x0       , // CB_COLOR1_FMASK_BASE_EXT
-      0x0       , // CB_COLOR2_FMASK_BASE_EXT
-      0x0       , // CB_COLOR3_FMASK_BASE_EXT
-      0x0       , // CB_COLOR4_FMASK_BASE_EXT
-      0x0       , // CB_COLOR5_FMASK_BASE_EXT
-      0x0       , // CB_COLOR6_FMASK_BASE_EXT
-      0x0       , // CB_COLOR7_FMASK_BASE_EXT
-      0x0       , // CB_COLOR0_DCC_BASE_EXT
-      0x0       , // CB_COLOR1_DCC_BASE_EXT
-      0x0       , // CB_COLOR2_DCC_BASE_EXT
-      0x0       , // CB_COLOR3_DCC_BASE_EXT
-      0x0       , // CB_COLOR4_DCC_BASE_EXT
-      0x0       , // CB_COLOR5_DCC_BASE_EXT
-      0x0       , // CB_COLOR6_DCC_BASE_EXT
-      0x0       , // CB_COLOR7_DCC_BASE_EXT
-      0x0       , // CB_COLOR0_ATTRIB2
-      0x0       , // CB_COLOR1_ATTRIB2
-      0x0       , // CB_COLOR2_ATTRIB2
-      0x0       , // CB_COLOR3_ATTRIB2
-      0x0       , // CB_COLOR4_ATTRIB2
-      0x0       , // CB_COLOR5_ATTRIB2
-      0x0       , // CB_COLOR6_ATTRIB2
-      0x0       , // CB_COLOR7_ATTRIB2
-      0x0       , // CB_COLOR0_ATTRIB3
-      0x0       , // CB_COLOR1_ATTRIB3
-      0x0       , // CB_COLOR2_ATTRIB3
-      0x0       , // CB_COLOR3_ATTRIB3
-      0x0       , // CB_COLOR4_ATTRIB3
-      0x0       , // CB_COLOR5_ATTRIB3
-      0x0       , // CB_COLOR6_ATTRIB3
+      0x0,        // PA_SC_SHADER_CONTROL
+      0x3,        // PA_SC_BINNER_CNTL_0
+      0x0,        // PA_SC_BINNER_CNTL_1
+      0x100000,   // PA_SC_CONSERVATIVE_RASTERIZATION_CNTL
+      0x0,        // PA_SC_NGG_MODE_CNTL
+      0x0,        //
+      0x1e,       // VGT_VERTEX_REUSE_BLOCK_CNTL
+      0x20,       // VGT_OUT_DEALLOC_CNTL
+      0x0,        // CB_COLOR0_BASE
+      0x0,        //
+      0x0,        //
+      0x0,        // CB_COLOR0_VIEW
+      0x0,        // CB_COLOR0_INFO
+      0x0,        // CB_COLOR0_ATTRIB
+      0x0,        // CB_COLOR0_DCC_CONTROL
+      0x0,        // CB_COLOR0_CMASK
+      0x0,        //
+      0x0,        // CB_COLOR0_FMASK
+      0x0,        //
+      0x0,        // CB_COLOR0_CLEAR_WORD0
+      0x0,        // CB_COLOR0_CLEAR_WORD1
+      0x0,        // CB_COLOR0_DCC_BASE
+      0x0,        //
+      0x0,        // CB_COLOR1_BASE
+      0x0,        //
+      0x0,        //
+      0x0,        // CB_COLOR1_VIEW
+      0x0,        // CB_COLOR1_INFO
+      0x0,        // CB_COLOR1_ATTRIB
+      0x0,        // CB_COLOR1_DCC_CONTROL
+      0x0,        // CB_COLOR1_CMASK
+      0x0,        //
+      0x0,        // CB_COLOR1_FMASK
+      0x0,        //
+      0x0,        // CB_COLOR1_CLEAR_WORD0
+      0x0,        // CB_COLOR1_CLEAR_WORD1
+      0x0,        // CB_COLOR1_DCC_BASE
+      0x0,        //
+      0x0,        // CB_COLOR2_BASE
+      0x0,        //
+      0x0,        //
+      0x0,        // CB_COLOR2_VIEW
+      0x0,        // CB_COLOR2_INFO
+      0x0,        // CB_COLOR2_ATTRIB
+      0x0,        // CB_COLOR2_DCC_CONTROL
+      0x0,        // CB_COLOR2_CMASK
+      0x0,        //
+      0x0,        // CB_COLOR2_FMASK
+      0x0,        //
+      0x0,        // CB_COLOR2_CLEAR_WORD0
+      0x0,        // CB_COLOR2_CLEAR_WORD1
+      0x0,        // CB_COLOR2_DCC_BASE
+      0x0,        //
+      0x0,        // CB_COLOR3_BASE
+      0x0,        //
+      0x0,        //
+      0x0,        // CB_COLOR3_VIEW
+      0x0,        // CB_COLOR3_INFO
+      0x0,        // CB_COLOR3_ATTRIB
+      0x0,        // CB_COLOR3_DCC_CONTROL
+      0x0,        // CB_COLOR3_CMASK
+      0x0,        //
+      0x0,        // CB_COLOR3_FMASK
+      0x0,        //
+      0x0,        // CB_COLOR3_CLEAR_WORD0
+      0x0,        // CB_COLOR3_CLEAR_WORD1
+      0x0,        // CB_COLOR3_DCC_BASE
+      0x0,        //
+      0x0,        // CB_COLOR4_BASE
+      0x0,        //
+      0x0,        //
+      0x0,        // CB_COLOR4_VIEW
+      0x0,        // CB_COLOR4_INFO
+      0x0,        // CB_COLOR4_ATTRIB
+      0x0,        // CB_COLOR4_DCC_CONTROL
+      0x0,        // CB_COLOR4_CMASK
+      0x0,        //
+      0x0,        // CB_COLOR4_FMASK
+      0x0,        //
+      0x0,        // CB_COLOR4_CLEAR_WORD0
+      0x0,        // CB_COLOR4_CLEAR_WORD1
+      0x0,        // CB_COLOR4_DCC_BASE
+      0x0,        //
+      0x0,        // CB_COLOR5_BASE
+      0x0,        //
+      0x0,        //
+      0x0,        // CB_COLOR5_VIEW
+      0x0,        // CB_COLOR5_INFO
+      0x0,        // CB_COLOR5_ATTRIB
+      0x0,        // CB_COLOR5_DCC_CONTROL
+      0x0,        // CB_COLOR5_CMASK
+      0x0,        //
+      0x0,        // CB_COLOR5_FMASK
+      0x0,        //
+      0x0,        // CB_COLOR5_CLEAR_WORD0
+      0x0,        // CB_COLOR5_CLEAR_WORD1
+      0x0,        // CB_COLOR5_DCC_BASE
+      0x0,        //
+      0x0,        // CB_COLOR6_BASE
+      0x0,        //
+      0x0,        //
+      0x0,        // CB_COLOR6_VIEW
+      0x0,        // CB_COLOR6_INFO
+      0x0,        // CB_COLOR6_ATTRIB
+      0x0,        // CB_COLOR6_DCC_CONTROL
+      0x0,        // CB_COLOR6_CMASK
+      0x0,        //
+      0x0,        // CB_COLOR6_FMASK
+      0x0,        //
+      0x0,        // CB_COLOR6_CLEAR_WORD0
+      0x0,        // CB_COLOR6_CLEAR_WORD1
+      0x0,        // CB_COLOR6_DCC_BASE
+      0x0,        //
+      0x0,        // CB_COLOR7_BASE
+      0x0,        //
+      0x0,        //
+      0x0,        // CB_COLOR7_VIEW
+      0x0,        // CB_COLOR7_INFO
+      0x0,        // CB_COLOR7_ATTRIB
+      0x0,        // CB_COLOR7_DCC_CONTROL
+      0x0,        // CB_COLOR7_CMASK
+      0x0,        //
+      0x0,        // CB_COLOR7_FMASK
+      0x0,        //
+      0x0,        // CB_COLOR7_CLEAR_WORD0
+      0x0,        // CB_COLOR7_CLEAR_WORD1
+      0x0,        // CB_COLOR7_DCC_BASE
+      0x0,        //
+      0x0,        // CB_COLOR0_BASE_EXT
+      0x0,        // CB_COLOR1_BASE_EXT
+      0x0,        // CB_COLOR2_BASE_EXT
+      0x0,        // CB_COLOR3_BASE_EXT
+      0x0,        // CB_COLOR4_BASE_EXT
+      0x0,        // CB_COLOR5_BASE_EXT
+      0x0,        // CB_COLOR6_BASE_EXT
+      0x0,        // CB_COLOR7_BASE_EXT
+      0x0,        // CB_COLOR0_CMASK_BASE_EXT
+      0x0,        // CB_COLOR1_CMASK_BASE_EXT
+      0x0,        // CB_COLOR2_CMASK_BASE_EXT
+      0x0,        // CB_COLOR3_CMASK_BASE_EXT
+      0x0,        // CB_COLOR4_CMASK_BASE_EXT
+      0x0,        // CB_COLOR5_CMASK_BASE_EXT
+      0x0,        // CB_COLOR6_CMASK_BASE_EXT
+      0x0,        // CB_COLOR7_CMASK_BASE_EXT
+      0x0,        // CB_COLOR0_FMASK_BASE_EXT
+      0x0,        // CB_COLOR1_FMASK_BASE_EXT
+      0x0,        // CB_COLOR2_FMASK_BASE_EXT
+      0x0,        // CB_COLOR3_FMASK_BASE_EXT
+      0x0,        // CB_COLOR4_FMASK_BASE_EXT
+      0x0,        // CB_COLOR5_FMASK_BASE_EXT
+      0x0,        // CB_COLOR6_FMASK_BASE_EXT
+      0x0,        // CB_COLOR7_FMASK_BASE_EXT
+      0x0,        // CB_COLOR0_DCC_BASE_EXT
+      0x0,        // CB_COLOR1_DCC_BASE_EXT
+      0x0,        // CB_COLOR2_DCC_BASE_EXT
+      0x0,        // CB_COLOR3_DCC_BASE_EXT
+      0x0,        // CB_COLOR4_DCC_BASE_EXT
+      0x0,        // CB_COLOR5_DCC_BASE_EXT
+      0x0,        // CB_COLOR6_DCC_BASE_EXT
+      0x0,        // CB_COLOR7_DCC_BASE_EXT
+      0x0,        // CB_COLOR0_ATTRIB2
+      0x0,        // CB_COLOR1_ATTRIB2
+      0x0,        // CB_COLOR2_ATTRIB2
+      0x0,        // CB_COLOR3_ATTRIB2
+      0x0,        // CB_COLOR4_ATTRIB2
+      0x0,        // CB_COLOR5_ATTRIB2
+      0x0,        // CB_COLOR6_ATTRIB2
+      0x0,        // CB_COLOR7_ATTRIB2
+      0x0,        // CB_COLOR0_ATTRIB3
+      0x0,        // CB_COLOR1_ATTRIB3
+      0x0,        // CB_COLOR2_ATTRIB3
+      0x0,        // CB_COLOR3_ATTRIB3
+      0x0,        // CB_COLOR4_ATTRIB3
+      0x0,        // CB_COLOR5_ATTRIB3
+      0x0,        // CB_COLOR6_ATTRIB3
       0x0         // CB_COLOR7_ATTRIB3
    };
 
    set_context_reg_seq_array(cs, R_028000_DB_RENDER_CONTROL, SET(DbRenderControlGfx103));
    set_context_reg_seq_array(cs, R_0281E8_COHER_DEST_BASE_HI_0, SET(CoherDestBaseHi0Gfx103));
-   set_context_reg_seq_array(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, SET(VgtMultiPrimIbResetIndxGfx103));
+   set_context_reg_seq_array(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
+                             SET(VgtMultiPrimIbResetIndxGfx103));
    set_context_reg_seq_array(cs, R_028644_SPI_PS_INPUT_CNTL_0, SET(SpiPsInputCntl0Gfx103));
-   set_context_reg_seq_array(cs, R_028750_SX_PS_DOWNCONVERT_CONTROL, SET(SxPsDownconvertControlGfx103));
-   set_context_reg_seq_array(cs, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP, SET(GeMaxOutputPerSubgroupGfx103));
+   set_context_reg_seq_array(cs, R_028750_SX_PS_DOWNCONVERT_CONTROL,
+                             SET(SxPsDownconvertControlGfx103));
+   set_context_reg_seq_array(cs, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
+                             SET(GeMaxOutputPerSubgroupGfx103));
    set_context_reg_seq_array(cs, R_02882C_PA_SU_PRIM_FILTER_CNTL, SET(PaSuPrimFilterCntlGfx103));
    set_context_reg_seq_array(cs, R_028A00_PA_SU_POINT_SIZE, SET(PaSuPointSizeGfx103));
    set_context_reg_seq_array(cs, R_028A18_VGT_HOS_MAX_TESS_LEVEL, SET(VgtHosMaxTessLevelGfx103));
@@ -2902,14 +2921,14 @@ static void gfx103_emulate_clear_state(struct radeon_cmdbuf *cs, unsigned num_re
    set_context_reg_seq_array(cs, R_028A84_VGT_PRIMITIVEID_EN, SET(VgtPrimitiveidEnGfx103));
    set_context_reg_seq_array(cs, R_028A8C_VGT_PRIMITIVEID_RESET, SET(VgtPrimitiveidResetGfx103));
    set_context_reg_seq_array(cs, R_028A98_VGT_DRAW_PAYLOAD_CNTL, SET(VgtDrawPayloadCntlGfx103));
-   set_context_reg_seq_array(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, SET(PaScCentroidPriority0Gfx103));
+   set_context_reg_seq_array(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0,
+                             SET(PaScCentroidPriority0Gfx103));
 
    for (unsigned i = 0; i < num_reg_pairs; i++)
       set_context_reg_seq_array(cs, reg_offsets[i], 1, &reg_values[i]);
 }
 
-void ac_emulate_clear_state(const struct radeon_info *info,
-                            struct radeon_cmdbuf *cs,
+void ac_emulate_clear_state(const struct radeon_info *info, struct radeon_cmdbuf *cs,
                             set_context_reg_seq_array_fn set_context_reg_seq_array)
 {
    /* Set context registers same as CLEAR_STATE to initialize shadow memory. */
@@ -2917,11 +2936,9 @@ void ac_emulate_clear_state(const struct radeon_info *info,
    uint32_t reg_value = info->pa_sc_tile_steering_override;
 
    if (info->chip_class == GFX10_3) {
-      gfx103_emulate_clear_state(cs, 1, &reg_offset, &reg_value,
-                                 set_context_reg_seq_array);
+      gfx103_emulate_clear_state(cs, 1, &reg_offset, &reg_value, set_context_reg_seq_array);
    } else if (info->chip_class == GFX10) {
-      gfx10_emulate_clear_state(cs, 1, &reg_offset, &reg_value,
-                                set_context_reg_seq_array);
+      gfx10_emulate_clear_state(cs, 1, &reg_offset, &reg_value, set_context_reg_seq_array);
    } else if (info->chip_class == GFX9) {
       gfx9_emulate_clear_state(cs, set_context_reg_seq_array);
    } else {
@@ -2949,8 +2966,7 @@ void ac_check_shadowed_regs(enum chip_class chip_class, enum radeon_family famil
          unsigned end_range_offset = ranges[i].offset + ranges[i].size;
 
         /* Test if the ranges intersect. */
-         if (MAX2(ranges[i].offset, reg_offset) <
-             MIN2(end_range_offset, end_reg_offset)) {
+         if (MAX2(ranges[i].offset, reg_offset) < MIN2(end_range_offset, end_reg_offset)) {
             /* Assertion: A register can be listed only once. */
             assert(!found);
             found = true;
@@ -2992,7 +3008,7 @@ void ac_print_shadowed_regs(const struct radeon_info *info)
 
       for (unsigned i = 0; i < num_ranges; i++) {
          for (unsigned j = 0; j < ranges[i].size / 4; j++) {
-            unsigned offset = ranges[i].offset + j*4;
+            unsigned offset = ranges[i].offset + j * 4;
 
             const char *name = ac_get_register_name(info->chip_class, offset);
             unsigned value = -1;
index b3f61db70b280889e8383084da09a67614460ed8..df2a7b72fc4dabede8846c1073ed3fff07ca960c 100644 (file)
@@ -35,7 +35,8 @@ struct ac_reg_range {
    unsigned size;
 };
 
-enum ac_reg_range_type {
+enum ac_reg_range_type
+{
    SI_REG_RANGE_UCONFIG,
    SI_REG_RANGE_CONTEXT,
    SI_REG_RANGE_SH,
@@ -46,14 +47,13 @@ enum ac_reg_range_type {
    SI_NUM_ALL_REG_RANGES,
 };
 
-typedef void (*set_context_reg_seq_array_fn)(struct radeon_cmdbuf *cs, unsigned reg,
-                                             unsigned num, const uint32_t *values);
+typedef void (*set_context_reg_seq_array_fn)(struct radeon_cmdbuf *cs, unsigned reg, unsigned num,
+                                             const uint32_t *values);
 
 void ac_get_reg_ranges(enum chip_class chip_class, enum radeon_family family,
                        enum ac_reg_range_type type, unsigned *num_ranges,
                        const struct ac_reg_range **ranges);
-void ac_emulate_clear_state(const struct radeon_info *info,
-                            struct radeon_cmdbuf *cs,
+void ac_emulate_clear_state(const struct radeon_info *info, struct radeon_cmdbuf *cs,
                             set_context_reg_seq_array_fn set_context_reg_seq_array);
 void ac_check_shadowed_regs(enum chip_class chip_class, enum radeon_family family,
                             unsigned reg_offset, unsigned count);
index 461fd503622fa974062247440e43052556a2d9a7..223a61e4764627ad5f9c81a47f007a6a9dc7f9dd 100644 (file)
  */
 
 #include "ac_surface.h"
-#include "amd_family.h"
-#include "addrlib/src/amdgpu_asic_addr.h"
+
 #include "ac_gpu_info.h"
+#include "addrlib/inc/addrinterface.h"
+#include "addrlib/src/amdgpu_asic_addr.h"
+#include "amd_family.h"
+#include "drm-uapi/amdgpu_drm.h"
+#include "sid.h"
 #include "util/hash_table.h"
 #include "util/macros.h"
 #include "util/simple_mtx.h"
 #include "util/u_atomic.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
-#include "sid.h"
 
+#include <amdgpu.h>
 #include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <amdgpu.h>
-#include "drm-uapi/amdgpu_drm.h"
-
-#include "addrlib/inc/addrinterface.h"
 
 #ifndef CIASICIDGFXENGINE_SOUTHERNISLAND
 #define CIASICIDGFXENGINE_SOUTHERNISLAND 0x0000000A
 #endif
 
 struct ac_addrlib {
-       ADDR_HANDLE handle;
-
-       /* The cache of DCC retile maps for reuse when allocating images of
-        * similar sizes.
-        */
-       simple_mtx_t dcc_retile_map_lock;
-       struct hash_table *dcc_retile_maps;
-       struct hash_table *dcc_retile_tile_indices;
+   ADDR_HANDLE handle;
+
+   /* The cache of DCC retile maps for reuse when allocating images of
+    * similar sizes.
+    */
+   simple_mtx_t dcc_retile_map_lock;
+   struct hash_table *dcc_retile_maps;
+   struct hash_table *dcc_retile_tile_indices;
 };
 
 struct dcc_retile_map_key {
-       enum radeon_family family;
-       unsigned retile_width;
-       unsigned retile_height;
-       bool rb_aligned;
-       bool pipe_aligned;
-       unsigned dcc_retile_num_elements;
-       ADDR2_COMPUTE_DCC_ADDRFROMCOORD_INPUT input;
+   enum radeon_family family;
+   unsigned retile_width;
+   unsigned retile_height;
+   bool rb_aligned;
+   bool pipe_aligned;
+   unsigned dcc_retile_num_elements;
+   ADDR2_COMPUTE_DCC_ADDRFROMCOORD_INPUT input;
 };
 
 static uint32_t dcc_retile_map_hash_key(const void *key)
 {
-       return _mesa_hash_data(key, sizeof(struct dcc_retile_map_key));
+   return _mesa_hash_data(key, sizeof(struct dcc_retile_map_key));
 }
 
 static bool dcc_retile_map_keys_equal(const void *a, const void *b)
 {
-       return memcmp(a, b, sizeof(struct dcc_retile_map_key)) == 0;
+   return memcmp(a, b, sizeof(struct dcc_retile_map_key)) == 0;
 }
 
 static void dcc_retile_map_free(struct hash_entry *entry)
 {
-       free((void*)entry->key);
-       free(entry->data);
+   free((void *)entry->key);
+   free(entry->data);
 }
 
 struct dcc_retile_tile_key {
-       enum radeon_family family;
-       unsigned bpp;
-       unsigned swizzle_mode;
-       bool rb_aligned;
-       bool pipe_aligned;
+   enum radeon_family family;
+   unsigned bpp;
+   unsigned swizzle_mode;
+   bool rb_aligned;
+   bool pipe_aligned;
 };
 
 struct dcc_retile_tile_data {
-       unsigned tile_width_log2;
-       unsigned tile_height_log2;
-       uint16_t *data;
+   unsigned tile_width_log2;
+   unsigned tile_height_log2;
+   uint16_t *data;
 };
 
 static uint32_t dcc_retile_tile_hash_key(const void *key)
 {
-       return _mesa_hash_data(key, sizeof(struct dcc_retile_tile_key));
+   return _mesa_hash_data(key, sizeof(struct dcc_retile_tile_key));
 }
 
 static bool dcc_retile_tile_keys_equal(const void *a, const void *b)
 {
-       return memcmp(a, b, sizeof(struct dcc_retile_tile_key)) == 0;
+   return memcmp(a, b, sizeof(struct dcc_retile_tile_key)) == 0;
 }
 
 static void dcc_retile_tile_free(struct hash_entry *entry)
 {
-       free((void*)entry->key);
-       free(((struct dcc_retile_tile_data*)entry->data)->data);
-       free(entry->data);
+   free((void *)entry->key);
+   free(((struct dcc_retile_tile_data *)entry->data)->data);
+   free(entry->data);
 }
 
 /* Assumes dcc_retile_map_lock is taken. */
 static const struct dcc_retile_tile_data *
-ac_compute_dcc_retile_tile_indices(struct ac_addrlib *addrlib,
-                                   const struct radeon_info *info,
-                                   unsigned bpp, unsigned swizzle_mode,
-                                   bool rb_aligned, bool pipe_aligned)
+ac_compute_dcc_retile_tile_indices(struct ac_addrlib *addrlib, const struct radeon_info *info,
+                                   unsigned bpp, unsigned swizzle_mode, bool rb_aligned,
+                                   bool pipe_aligned)
 {
-       struct dcc_retile_tile_key key = (struct dcc_retile_tile_key) {
-               .family = info->family,
-               .bpp = bpp,
-               .swizzle_mode = swizzle_mode,
-               .rb_aligned = rb_aligned,
-               .pipe_aligned = pipe_aligned
-       };
-
-       struct hash_entry *entry = _mesa_hash_table_search(addrlib->dcc_retile_tile_indices, &key);
-       if (entry)
-               return entry->data;
-
-       ADDR2_COMPUTE_DCCINFO_INPUT din = {0};
-       ADDR2_COMPUTE_DCCINFO_OUTPUT dout = {0};
-       din.size = sizeof(ADDR2_COMPUTE_DCCINFO_INPUT);
-       dout.size = sizeof(ADDR2_COMPUTE_DCCINFO_OUTPUT);
-
-       din.dccKeyFlags.pipeAligned = pipe_aligned;
-       din.dccKeyFlags.rbAligned = rb_aligned;
-       din.resourceType = ADDR_RSRC_TEX_2D;
-       din.swizzleMode = swizzle_mode;
-       din.bpp = bpp;
-       din.unalignedWidth = 1;
-       din.unalignedHeight = 1;
-       din.numSlices = 1;
-       din.numFrags = 1;
-       din.numMipLevels = 1;
-
-       ADDR_E_RETURNCODE ret = Addr2ComputeDccInfo(addrlib->handle, &din, &dout);
-       if (ret != ADDR_OK)
-               return NULL;
-
-       ADDR2_COMPUTE_DCC_ADDRFROMCOORD_INPUT addrin = {0};
-       addrin.size = sizeof(addrin);
-       addrin.swizzleMode = swizzle_mode;
-       addrin.resourceType = ADDR_RSRC_TEX_2D;
-       addrin.bpp = bpp;
-       addrin.numSlices = 1;
-       addrin.numMipLevels = 1;
-       addrin.numFrags = 1;
-       addrin.pitch = dout.pitch;
-       addrin.height = dout.height;
-       addrin.compressBlkWidth = dout.compressBlkWidth;
-       addrin.compressBlkHeight = dout.compressBlkHeight;
-       addrin.compressBlkDepth = dout.compressBlkDepth;
-       addrin.metaBlkWidth = dout.metaBlkWidth;
-       addrin.metaBlkHeight = dout.metaBlkHeight;
-       addrin.metaBlkDepth = dout.metaBlkDepth;
-       addrin.dccKeyFlags.pipeAligned = pipe_aligned;
-       addrin.dccKeyFlags.rbAligned = rb_aligned;
-
-       unsigned w = dout.metaBlkWidth / dout.compressBlkWidth;
-       unsigned h = dout.metaBlkHeight / dout.compressBlkHeight;
-       uint16_t *indices = malloc(w * h * sizeof (uint16_t));
-       if (!indices)
-               return NULL;
-
-       ADDR2_COMPUTE_DCC_ADDRFROMCOORD_OUTPUT addrout = {};
-       addrout.size = sizeof(addrout);
-
-       for (unsigned y = 0; y < h; ++y) {
-               addrin.y = y * dout.compressBlkHeight;
-               for (unsigned x = 0; x < w; ++x) {
-                       addrin.x = x * dout.compressBlkWidth;
-                       addrout.addr = 0;
-
-                       if (Addr2ComputeDccAddrFromCoord(addrlib->handle, &addrin, &addrout) != ADDR_OK) {
-                               free(indices);
-                               return NULL;
-                       }
-                       indices[y * w + x] = addrout.addr;
-               }
-       }
-
-       struct dcc_retile_tile_data *data = calloc(1, sizeof(*data));
-       if (!data) {
-               free(indices);
-               return NULL;
-       }
-
-       data->tile_width_log2 = util_logbase2(w);
-       data->tile_height_log2 = util_logbase2(h);
-       data->data = indices;
-
-       struct dcc_retile_tile_key *heap_key = mem_dup(&key, sizeof(key));
-       if (!heap_key) {
-               free(data);
-               free(indices);
-               return NULL;
-       }
-
-       entry = _mesa_hash_table_insert(addrlib->dcc_retile_tile_indices, heap_key, data);
-       if (!entry) {
-               free(heap_key);
-               free(data);
-               free(indices);
-       }
-       return data;
+   struct dcc_retile_tile_key key = (struct dcc_retile_tile_key){.family = info->family,
+                                                                 .bpp = bpp,
+                                                                 .swizzle_mode = swizzle_mode,
+                                                                 .rb_aligned = rb_aligned,
+                                                                 .pipe_aligned = pipe_aligned};
+
+   struct hash_entry *entry = _mesa_hash_table_search(addrlib->dcc_retile_tile_indices, &key);
+   if (entry)
+      return entry->data;
+
+   ADDR2_COMPUTE_DCCINFO_INPUT din = {0};
+   ADDR2_COMPUTE_DCCINFO_OUTPUT dout = {0};
+   din.size = sizeof(ADDR2_COMPUTE_DCCINFO_INPUT);
+   dout.size = sizeof(ADDR2_COMPUTE_DCCINFO_OUTPUT);
+
+   din.dccKeyFlags.pipeAligned = pipe_aligned;
+   din.dccKeyFlags.rbAligned = rb_aligned;
+   din.resourceType = ADDR_RSRC_TEX_2D;
+   din.swizzleMode = swizzle_mode;
+   din.bpp = bpp;
+   din.unalignedWidth = 1;
+   din.unalignedHeight = 1;
+   din.numSlices = 1;
+   din.numFrags = 1;
+   din.numMipLevels = 1;
+
+   ADDR_E_RETURNCODE ret = Addr2ComputeDccInfo(addrlib->handle, &din, &dout);
+   if (ret != ADDR_OK)
+      return NULL;
+
+   ADDR2_COMPUTE_DCC_ADDRFROMCOORD_INPUT addrin = {0};
+   addrin.size = sizeof(addrin);
+   addrin.swizzleMode = swizzle_mode;
+   addrin.resourceType = ADDR_RSRC_TEX_2D;
+   addrin.bpp = bpp;
+   addrin.numSlices = 1;
+   addrin.numMipLevels = 1;
+   addrin.numFrags = 1;
+   addrin.pitch = dout.pitch;
+   addrin.height = dout.height;
+   addrin.compressBlkWidth = dout.compressBlkWidth;
+   addrin.compressBlkHeight = dout.compressBlkHeight;
+   addrin.compressBlkDepth = dout.compressBlkDepth;
+   addrin.metaBlkWidth = dout.metaBlkWidth;
+   addrin.metaBlkHeight = dout.metaBlkHeight;
+   addrin.metaBlkDepth = dout.metaBlkDepth;
+   addrin.dccKeyFlags.pipeAligned = pipe_aligned;
+   addrin.dccKeyFlags.rbAligned = rb_aligned;
+
+   unsigned w = dout.metaBlkWidth / dout.compressBlkWidth;
+   unsigned h = dout.metaBlkHeight / dout.compressBlkHeight;
+   uint16_t *indices = malloc(w * h * sizeof(uint16_t));
+   if (!indices)
+      return NULL;
+
+   ADDR2_COMPUTE_DCC_ADDRFROMCOORD_OUTPUT addrout = {};
+   addrout.size = sizeof(addrout);
+
+   for (unsigned y = 0; y < h; ++y) {
+      addrin.y = y * dout.compressBlkHeight;
+      for (unsigned x = 0; x < w; ++x) {
+         addrin.x = x * dout.compressBlkWidth;
+         addrout.addr = 0;
+
+         if (Addr2ComputeDccAddrFromCoord(addrlib->handle, &addrin, &addrout) != ADDR_OK) {
+            free(indices);
+            return NULL;
+         }
+         indices[y * w + x] = addrout.addr;
+      }
+   }
+
+   struct dcc_retile_tile_data *data = calloc(1, sizeof(*data));
+   if (!data) {
+      free(indices);
+      return NULL;
+   }
+
+   data->tile_width_log2 = util_logbase2(w);
+   data->tile_height_log2 = util_logbase2(h);
+   data->data = indices;
+
+   struct dcc_retile_tile_key *heap_key = mem_dup(&key, sizeof(key));
+   if (!heap_key) {
+      free(data);
+      free(indices);
+      return NULL;
+   }
+
+   entry = _mesa_hash_table_insert(addrlib->dcc_retile_tile_indices, heap_key, data);
+   if (!entry) {
+      free(heap_key);
+      free(data);
+      free(indices);
+   }
+   return data;
 }
 
 static uint32_t ac_compute_retile_tile_addr(const struct dcc_retile_tile_data *tile,
                                             unsigned stride, unsigned x, unsigned y)
 {
-       unsigned x_mask = (1u << tile->tile_width_log2) - 1;
-       unsigned y_mask = (1u << tile->tile_height_log2) - 1;
-       unsigned tile_size_log2 = tile->tile_width_log2 + tile->tile_height_log2;
-
-       unsigned base = ((y >> tile->tile_height_log2) * stride + (x >> tile->tile_width_log2)) << tile_size_log2;
-       unsigned offset_in_tile = tile->data[((y & y_mask) << tile->tile_width_log2) + (x & x_mask)];
-       return base + offset_in_tile;
+   unsigned x_mask = (1u << tile->tile_width_log2) - 1;
+   unsigned y_mask = (1u << tile->tile_height_log2) - 1;
+   unsigned tile_size_log2 = tile->tile_width_log2 + tile->tile_height_log2;
+
+   unsigned base = ((y >> tile->tile_height_log2) * stride + (x >> tile->tile_width_log2))
+                   << tile_size_log2;
+   unsigned offset_in_tile = tile->data[((y & y_mask) << tile->tile_width_log2) + (x & x_mask)];
+   return base + offset_in_tile;
 }
 
 static uint32_t *ac_compute_dcc_retile_map(struct ac_addrlib *addrlib,
-                                          const struct radeon_info *info,
-                                          unsigned retile_width, unsigned retile_height,
-                                          bool rb_aligned, bool pipe_aligned, bool use_uint16,
-                                          unsigned dcc_retile_num_elements,
-                                          const ADDR2_COMPUTE_DCC_ADDRFROMCOORD_INPUT *in)
+                                           const struct radeon_info *info, unsigned retile_width,
+                                           unsigned retile_height, bool rb_aligned,
+                                           bool pipe_aligned, bool use_uint16,
+                                           unsigned dcc_retile_num_elements,
+                                           const ADDR2_COMPUTE_DCC_ADDRFROMCOORD_INPUT *in)
 {
-       unsigned dcc_retile_map_size = dcc_retile_num_elements * (use_uint16 ? 2 : 4);
-       struct dcc_retile_map_key key;
-
-       assert(in->numFrags == 1 && in->numSlices == 1 && in->numMipLevels == 1);
-
-       memset(&key, 0, sizeof(key));
-       key.family = info->family;
-       key.retile_width = retile_width;
-       key.retile_height = retile_height;
-       key.rb_aligned = rb_aligned;
-       key.pipe_aligned = pipe_aligned;
-       key.dcc_retile_num_elements = dcc_retile_num_elements;
-       memcpy(&key.input, in, sizeof(*in));
-
-       simple_mtx_lock(&addrlib->dcc_retile_map_lock);
-
-       /* If we have already computed this retile map, get it from the hash table. */
-       struct hash_entry *entry = _mesa_hash_table_search(addrlib->dcc_retile_maps, &key);
-       if (entry) {
-               uint32_t *map = entry->data;
-               simple_mtx_unlock(&addrlib->dcc_retile_map_lock);
-               return map;
-       }
-
-       const struct dcc_retile_tile_data *src_tile =
-               ac_compute_dcc_retile_tile_indices(addrlib, info, in->bpp,
-                                                  in->swizzleMode,
-                                                  rb_aligned, pipe_aligned);
-       const struct dcc_retile_tile_data *dst_tile =
-               ac_compute_dcc_retile_tile_indices(addrlib, info, in->bpp,
-                                                  in->swizzleMode, false, false);
-       if (!src_tile || !dst_tile) {
-               simple_mtx_unlock(&addrlib->dcc_retile_map_lock);
-               return NULL;
-       }
-
-       void *dcc_retile_map = malloc(dcc_retile_map_size);
-       if (!dcc_retile_map) {
-               simple_mtx_unlock(&addrlib->dcc_retile_map_lock);
-               return NULL;
-       }
-
-       unsigned index = 0;
-       unsigned w = DIV_ROUND_UP(retile_width, in->compressBlkWidth);
-       unsigned h = DIV_ROUND_UP(retile_height, in->compressBlkHeight);
-       unsigned src_stride = DIV_ROUND_UP(w, 1u << src_tile->tile_width_log2);
-       unsigned dst_stride = DIV_ROUND_UP(w, 1u << dst_tile->tile_width_log2);
-
-       for (unsigned y = 0; y < h; ++y) {
-               for (unsigned x = 0; x < w; ++x) {
-                       unsigned src_addr = ac_compute_retile_tile_addr(src_tile, src_stride, x, y);
-                       unsigned dst_addr = ac_compute_retile_tile_addr(dst_tile, dst_stride, x, y);
-
-                       if (use_uint16) {
-                               ((uint16_t*)dcc_retile_map)[2 * index] = src_addr;
-                               ((uint16_t*)dcc_retile_map)[2 * index + 1] = dst_addr;
-                       } else {
-                               ((uint32_t*)dcc_retile_map)[2 * index] = src_addr;
-                               ((uint32_t*)dcc_retile_map)[2 * index + 1] = dst_addr;
-                       }
-                       ++index;
-               }
-       }
-
-       /* Fill the remaining pairs with the last one (for the compute shader). */
-       for (unsigned i = index * 2; i < dcc_retile_num_elements; i++) {
-               if (use_uint16)
-                       ((uint16_t*)dcc_retile_map)[i] = ((uint16_t*)dcc_retile_map)[i - 2];
-               else
-                       ((uint32_t*)dcc_retile_map)[i] = ((uint32_t*)dcc_retile_map)[i - 2];
-       }
-
-       /* Insert the retile map into the hash table, so that it can be reused and
-        * the computation can be skipped for similar image sizes.
-        */
-       _mesa_hash_table_insert(addrlib->dcc_retile_maps,
-                               mem_dup(&key, sizeof(key)), dcc_retile_map);
-
-       simple_mtx_unlock(&addrlib->dcc_retile_map_lock);
-       return dcc_retile_map;
+   unsigned dcc_retile_map_size = dcc_retile_num_elements * (use_uint16 ? 2 : 4);
+   struct dcc_retile_map_key key;
+
+   assert(in->numFrags == 1 && in->numSlices == 1 && in->numMipLevels == 1);
+
+   memset(&key, 0, sizeof(key));
+   key.family = info->family;
+   key.retile_width = retile_width;
+   key.retile_height = retile_height;
+   key.rb_aligned = rb_aligned;
+   key.pipe_aligned = pipe_aligned;
+   key.dcc_retile_num_elements = dcc_retile_num_elements;
+   memcpy(&key.input, in, sizeof(*in));
+
+   simple_mtx_lock(&addrlib->dcc_retile_map_lock);
+
+   /* If we have already computed this retile map, get it from the hash table. */
+   struct hash_entry *entry = _mesa_hash_table_search(addrlib->dcc_retile_maps, &key);
+   if (entry) {
+      uint32_t *map = entry->data;
+      simple_mtx_unlock(&addrlib->dcc_retile_map_lock);
+      return map;
+   }
+
+   const struct dcc_retile_tile_data *src_tile = ac_compute_dcc_retile_tile_indices(
+      addrlib, info, in->bpp, in->swizzleMode, rb_aligned, pipe_aligned);
+   const struct dcc_retile_tile_data *dst_tile =
+      ac_compute_dcc_retile_tile_indices(addrlib, info, in->bpp, in->swizzleMode, false, false);
+   if (!src_tile || !dst_tile) {
+      simple_mtx_unlock(&addrlib->dcc_retile_map_lock);
+      return NULL;
+   }
+
+   void *dcc_retile_map = malloc(dcc_retile_map_size);
+   if (!dcc_retile_map) {
+      simple_mtx_unlock(&addrlib->dcc_retile_map_lock);
+      return NULL;
+   }
+
+   unsigned index = 0;
+   unsigned w = DIV_ROUND_UP(retile_width, in->compressBlkWidth);
+   unsigned h = DIV_ROUND_UP(retile_height, in->compressBlkHeight);
+   unsigned src_stride = DIV_ROUND_UP(w, 1u << src_tile->tile_width_log2);
+   unsigned dst_stride = DIV_ROUND_UP(w, 1u << dst_tile->tile_width_log2);
+
+   for (unsigned y = 0; y < h; ++y) {
+      for (unsigned x = 0; x < w; ++x) {
+         unsigned src_addr = ac_compute_retile_tile_addr(src_tile, src_stride, x, y);
+         unsigned dst_addr = ac_compute_retile_tile_addr(dst_tile, dst_stride, x, y);
+
+         if (use_uint16) {
+            ((uint16_t *)dcc_retile_map)[2 * index] = src_addr;
+            ((uint16_t *)dcc_retile_map)[2 * index + 1] = dst_addr;
+         } else {
+            ((uint32_t *)dcc_retile_map)[2 * index] = src_addr;
+            ((uint32_t *)dcc_retile_map)[2 * index + 1] = dst_addr;
+         }
+         ++index;
+      }
+   }
+
+   /* Fill the remaining pairs with the last one (for the compute shader). */
+   for (unsigned i = index * 2; i < dcc_retile_num_elements; i++) {
+      if (use_uint16)
+         ((uint16_t *)dcc_retile_map)[i] = ((uint16_t *)dcc_retile_map)[i - 2];
+      else
+         ((uint32_t *)dcc_retile_map)[i] = ((uint32_t *)dcc_retile_map)[i - 2];
+   }
+
+   /* Insert the retile map into the hash table, so that it can be reused and
+    * the computation can be skipped for similar image sizes.
+    */
+   _mesa_hash_table_insert(addrlib->dcc_retile_maps, mem_dup(&key, sizeof(key)), dcc_retile_map);
+
+   simple_mtx_unlock(&addrlib->dcc_retile_map_lock);
+   return dcc_retile_map;
 }
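The resulting map is a flat list of (source, destination) address pairs: the source side uses the RB/pipe-aligned DCC layout and the destination side the unaligned, displayable layout, and the duplicated trailing pairs let a consumer iterate a fixed element count without bounds checks. As a hedged sketch of what walking such a map looks like on the CPU (the driver does the actual retiling in a compute shader; the byte-offset interpretation and the uint16 variant below are illustrative assumptions):

   #include <stdint.h>

   static void retile_dcc_cpu(const uint16_t *retile_map, unsigned num_elements,
                              const uint8_t *dcc_aligned, uint8_t *dcc_displayable)
   {
      /* Each pair is (aligned address, displayable address). */
      for (unsigned i = 0; i < num_elements; i += 2)
         dcc_displayable[retile_map[i + 1]] = dcc_aligned[retile_map[i]];
   }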
 
-static void *ADDR_API allocSysMem(const ADDR_ALLOCSYSMEM_INPUT * pInput)
+static void *ADDR_API allocSysMem(const ADDR_ALLOCSYSMEM_INPUT *pInput)
 {
-       return malloc(pInput->sizeInBytes);
+   return malloc(pInput->sizeInBytes);
 }
 
-static ADDR_E_RETURNCODE ADDR_API freeSysMem(const ADDR_FREESYSMEM_INPUT * pInput)
+static ADDR_E_RETURNCODE ADDR_API freeSysMem(const ADDR_FREESYSMEM_INPUT *pInput)
 {
-       free(pInput->pVirtAddr);
-       return ADDR_OK;
+   free(pInput->pVirtAddr);
+   return ADDR_OK;
 }
 
 struct ac_addrlib *ac_addrlib_create(const struct radeon_info *info,
-                                    const struct amdgpu_gpu_info *amdinfo,
-                                    uint64_t *max_alignment)
+                                     const struct amdgpu_gpu_info *amdinfo, uint64_t *max_alignment)
 {
-       ADDR_CREATE_INPUT addrCreateInput = {0};
-       ADDR_CREATE_OUTPUT addrCreateOutput = {0};
-       ADDR_REGISTER_VALUE regValue = {0};
-       ADDR_CREATE_FLAGS createFlags = {{0}};
-       ADDR_GET_MAX_ALIGNMENTS_OUTPUT addrGetMaxAlignmentsOutput = {0};
-       ADDR_E_RETURNCODE addrRet;
-
-       addrCreateInput.size = sizeof(ADDR_CREATE_INPUT);
-       addrCreateOutput.size = sizeof(ADDR_CREATE_OUTPUT);
-
-       regValue.gbAddrConfig = amdinfo->gb_addr_cfg;
-       createFlags.value = 0;
-
-       addrCreateInput.chipFamily = info->family_id;
-       addrCreateInput.chipRevision = info->chip_external_rev;
-
-       if (addrCreateInput.chipFamily == FAMILY_UNKNOWN)
-               return NULL;
-
-       if (addrCreateInput.chipFamily >= FAMILY_AI) {
-               addrCreateInput.chipEngine = CIASICIDGFXENGINE_ARCTICISLAND;
-       } else {
-               regValue.noOfBanks = amdinfo->mc_arb_ramcfg & 0x3;
-               regValue.noOfRanks = (amdinfo->mc_arb_ramcfg & 0x4) >> 2;
-
-               regValue.backendDisables = amdinfo->enabled_rb_pipes_mask;
-               regValue.pTileConfig = amdinfo->gb_tile_mode;
-               regValue.noOfEntries = ARRAY_SIZE(amdinfo->gb_tile_mode);
-               if (addrCreateInput.chipFamily == FAMILY_SI) {
-                       regValue.pMacroTileConfig = NULL;
-                       regValue.noOfMacroEntries = 0;
-               } else {
-                       regValue.pMacroTileConfig = amdinfo->gb_macro_tile_mode;
-                       regValue.noOfMacroEntries = ARRAY_SIZE(amdinfo->gb_macro_tile_mode);
-               }
-
-               createFlags.useTileIndex = 1;
-               createFlags.useHtileSliceAlign = 1;
-
-               addrCreateInput.chipEngine = CIASICIDGFXENGINE_SOUTHERNISLAND;
-       }
-
-       addrCreateInput.callbacks.allocSysMem = allocSysMem;
-       addrCreateInput.callbacks.freeSysMem = freeSysMem;
-       addrCreateInput.callbacks.debugPrint = 0;
-       addrCreateInput.createFlags = createFlags;
-       addrCreateInput.regValue = regValue;
-
-       addrRet = AddrCreate(&addrCreateInput, &addrCreateOutput);
-       if (addrRet != ADDR_OK)
-               return NULL;
-
-       if (max_alignment) {
-               addrRet = AddrGetMaxAlignments(addrCreateOutput.hLib, &addrGetMaxAlignmentsOutput);
-               if (addrRet == ADDR_OK){
-                       *max_alignment = addrGetMaxAlignmentsOutput.baseAlign;
-               }
-       }
-
-       struct ac_addrlib *addrlib = calloc(1, sizeof(struct ac_addrlib));
-       if (!addrlib) {
-               AddrDestroy(addrCreateOutput.hLib);
-               return NULL;
-       }
-
-       addrlib->handle = addrCreateOutput.hLib;
-       simple_mtx_init(&addrlib->dcc_retile_map_lock, mtx_plain);
-       addrlib->dcc_retile_maps = _mesa_hash_table_create(NULL, dcc_retile_map_hash_key,
-                                                          dcc_retile_map_keys_equal);
-       addrlib->dcc_retile_tile_indices = _mesa_hash_table_create(NULL, dcc_retile_tile_hash_key,
-                                                                  dcc_retile_tile_keys_equal);
-       return addrlib;
+   ADDR_CREATE_INPUT addrCreateInput = {0};
+   ADDR_CREATE_OUTPUT addrCreateOutput = {0};
+   ADDR_REGISTER_VALUE regValue = {0};
+   ADDR_CREATE_FLAGS createFlags = {{0}};
+   ADDR_GET_MAX_ALIGNMENTS_OUTPUT addrGetMaxAlignmentsOutput = {0};
+   ADDR_E_RETURNCODE addrRet;
+
+   addrCreateInput.size = sizeof(ADDR_CREATE_INPUT);
+   addrCreateOutput.size = sizeof(ADDR_CREATE_OUTPUT);
+
+   regValue.gbAddrConfig = amdinfo->gb_addr_cfg;
+   createFlags.value = 0;
+
+   addrCreateInput.chipFamily = info->family_id;
+   addrCreateInput.chipRevision = info->chip_external_rev;
+
+   if (addrCreateInput.chipFamily == FAMILY_UNKNOWN)
+      return NULL;
+
+   if (addrCreateInput.chipFamily >= FAMILY_AI) {
+      addrCreateInput.chipEngine = CIASICIDGFXENGINE_ARCTICISLAND;
+   } else {
+      regValue.noOfBanks = amdinfo->mc_arb_ramcfg & 0x3;
+      regValue.noOfRanks = (amdinfo->mc_arb_ramcfg & 0x4) >> 2;
+
+      regValue.backendDisables = amdinfo->enabled_rb_pipes_mask;
+      regValue.pTileConfig = amdinfo->gb_tile_mode;
+      regValue.noOfEntries = ARRAY_SIZE(amdinfo->gb_tile_mode);
+      if (addrCreateInput.chipFamily == FAMILY_SI) {
+         regValue.pMacroTileConfig = NULL;
+         regValue.noOfMacroEntries = 0;
+      } else {
+         regValue.pMacroTileConfig = amdinfo->gb_macro_tile_mode;
+         regValue.noOfMacroEntries = ARRAY_SIZE(amdinfo->gb_macro_tile_mode);
+      }
+
+      createFlags.useTileIndex = 1;
+      createFlags.useHtileSliceAlign = 1;
+
+      addrCreateInput.chipEngine = CIASICIDGFXENGINE_SOUTHERNISLAND;
+   }
+
+   addrCreateInput.callbacks.allocSysMem = allocSysMem;
+   addrCreateInput.callbacks.freeSysMem = freeSysMem;
+   addrCreateInput.callbacks.debugPrint = 0;
+   addrCreateInput.createFlags = createFlags;
+   addrCreateInput.regValue = regValue;
+
+   addrRet = AddrCreate(&addrCreateInput, &addrCreateOutput);
+   if (addrRet != ADDR_OK)
+      return NULL;
+
+   if (max_alignment) {
+      addrRet = AddrGetMaxAlignments(addrCreateOutput.hLib, &addrGetMaxAlignmentsOutput);
+      if (addrRet == ADDR_OK) {
+         *max_alignment = addrGetMaxAlignmentsOutput.baseAlign;
+      }
+   }
+
+   struct ac_addrlib *addrlib = calloc(1, sizeof(struct ac_addrlib));
+   if (!addrlib) {
+      AddrDestroy(addrCreateOutput.hLib);
+      return NULL;
+   }
+
+   addrlib->handle = addrCreateOutput.hLib;
+   simple_mtx_init(&addrlib->dcc_retile_map_lock, mtx_plain);
+   addrlib->dcc_retile_maps =
+      _mesa_hash_table_create(NULL, dcc_retile_map_hash_key, dcc_retile_map_keys_equal);
+   addrlib->dcc_retile_tile_indices =
+      _mesa_hash_table_create(NULL, dcc_retile_tile_hash_key, dcc_retile_tile_keys_equal);
+   return addrlib;
 }
 
 void ac_addrlib_destroy(struct ac_addrlib *addrlib)
 {
-       AddrDestroy(addrlib->handle);
-       simple_mtx_destroy(&addrlib->dcc_retile_map_lock);
-       _mesa_hash_table_destroy(addrlib->dcc_retile_maps, dcc_retile_map_free);
-       _mesa_hash_table_destroy(addrlib->dcc_retile_tile_indices, dcc_retile_tile_free);
-       free(addrlib);
+   AddrDestroy(addrlib->handle);
+   simple_mtx_destroy(&addrlib->dcc_retile_map_lock);
+   _mesa_hash_table_destroy(addrlib->dcc_retile_maps, dcc_retile_map_free);
+   _mesa_hash_table_destroy(addrlib->dcc_retile_tile_indices, dcc_retile_tile_free);
+   free(addrlib);
 }
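A typical lifetime of the object created above, as a sketch (the wrapper function and its name are made up; real callers pass a radeon_info / amdgpu_gpu_info queried from the kernel):

   static struct ac_addrlib *example_create(const struct radeon_info *info,
                                            const struct amdgpu_gpu_info *amdinfo)
   {
      uint64_t max_alignment = 0;
      struct ac_addrlib *addrlib = ac_addrlib_create(info, amdinfo, &max_alignment);

      if (!addrlib)
         return NULL; /* unknown chip family, AddrCreate failure, or out of memory */

      /* max_alignment now holds the largest base alignment addrlib may request.
       * Use addrlib to compute surface layouts; on screen teardown call:
       * ac_addrlib_destroy(addrlib);
       */
      return addrlib;
   }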
 
-static int surf_config_sanity(const struct ac_surf_config *config,
-                             unsigned flags)
+static int surf_config_sanity(const struct ac_surf_config *config, unsigned flags)
 {
-       /* FMASK is allocated together with the color surface and can't be
-        * allocated separately.
-        */
-       assert(!(flags & RADEON_SURF_FMASK));
-       if (flags & RADEON_SURF_FMASK)
-               return -EINVAL;
-
-       /* all dimension must be at least 1 ! */
-       if (!config->info.width || !config->info.height || !config->info.depth ||
-           !config->info.array_size || !config->info.levels)
-               return -EINVAL;
-
-       switch (config->info.samples) {
-       case 0:
-       case 1:
-       case 2:
-       case 4:
-       case 8:
-               break;
-       case 16:
-               if (flags & RADEON_SURF_Z_OR_SBUFFER)
-                       return -EINVAL;
-               break;
-       default:
-               return -EINVAL;
-       }
-
-       if (!(flags & RADEON_SURF_Z_OR_SBUFFER)) {
-               switch (config->info.storage_samples) {
-               case 0:
-               case 1:
-               case 2:
-               case 4:
-               case 8:
-                       break;
-               default:
-                       return -EINVAL;
-               }
-       }
-
-       if (config->is_3d && config->info.array_size > 1)
-               return -EINVAL;
-       if (config->is_cube && config->info.depth > 1)
-               return -EINVAL;
-
-       return 0;
+   /* FMASK is allocated together with the color surface and can't be
+    * allocated separately.
+    */
+   assert(!(flags & RADEON_SURF_FMASK));
+   if (flags & RADEON_SURF_FMASK)
+      return -EINVAL;
+
+   /* All dimensions must be at least 1! */
+   if (!config->info.width || !config->info.height || !config->info.depth ||
+       !config->info.array_size || !config->info.levels)
+      return -EINVAL;
+
+   switch (config->info.samples) {
+   case 0:
+   case 1:
+   case 2:
+   case 4:
+   case 8:
+      break;
+   case 16:
+      if (flags & RADEON_SURF_Z_OR_SBUFFER)
+         return -EINVAL;
+      break;
+   default:
+      return -EINVAL;
+   }
+
+   if (!(flags & RADEON_SURF_Z_OR_SBUFFER)) {
+      switch (config->info.storage_samples) {
+      case 0:
+      case 1:
+      case 2:
+      case 4:
+      case 8:
+         break;
+      default:
+         return -EINVAL;
+      }
+   }
+
+   if (config->is_3d && config->info.array_size > 1)
+      return -EINVAL;
+   if (config->is_cube && config->info.depth > 1)
+      return -EINVAL;
+
+   return 0;
 }
 
-static int gfx6_compute_level(ADDR_HANDLE addrlib,
-                             const struct ac_surf_config *config,
-                             struct radeon_surf *surf, bool is_stencil,
-                             unsigned level, bool compressed,
-                             ADDR_COMPUTE_SURFACE_INFO_INPUT *AddrSurfInfoIn,
-                             ADDR_COMPUTE_SURFACE_INFO_OUTPUT *AddrSurfInfoOut,
-                             ADDR_COMPUTE_DCCINFO_INPUT *AddrDccIn,
-                             ADDR_COMPUTE_DCCINFO_OUTPUT *AddrDccOut,
-                             ADDR_COMPUTE_HTILE_INFO_INPUT *AddrHtileIn,
-                             ADDR_COMPUTE_HTILE_INFO_OUTPUT *AddrHtileOut)
+static int gfx6_compute_level(ADDR_HANDLE addrlib, const struct ac_surf_config *config,
+                              struct radeon_surf *surf, bool is_stencil, unsigned level,
+                              bool compressed, ADDR_COMPUTE_SURFACE_INFO_INPUT *AddrSurfInfoIn,
+                              ADDR_COMPUTE_SURFACE_INFO_OUTPUT *AddrSurfInfoOut,
+                              ADDR_COMPUTE_DCCINFO_INPUT *AddrDccIn,
+                              ADDR_COMPUTE_DCCINFO_OUTPUT *AddrDccOut,
+                              ADDR_COMPUTE_HTILE_INFO_INPUT *AddrHtileIn,
+                              ADDR_COMPUTE_HTILE_INFO_OUTPUT *AddrHtileOut)
 {
-       struct legacy_surf_level *surf_level;
-       ADDR_E_RETURNCODE ret;
-
-       AddrSurfInfoIn->mipLevel = level;
-       AddrSurfInfoIn->width = u_minify(config->info.width, level);
-       AddrSurfInfoIn->height = u_minify(config->info.height, level);
-
-       /* Make GFX6 linear surfaces compatible with GFX9 for hybrid graphics,
-        * because GFX9 needs linear alignment of 256 bytes.
-        */
-       if (config->info.levels == 1 &&
-           AddrSurfInfoIn->tileMode == ADDR_TM_LINEAR_ALIGNED &&
-           AddrSurfInfoIn->bpp &&
-           util_is_power_of_two_or_zero(AddrSurfInfoIn->bpp)) {
-               unsigned alignment = 256 / (AddrSurfInfoIn->bpp / 8);
-
-               AddrSurfInfoIn->width = align(AddrSurfInfoIn->width, alignment);
-       }
-
-       /* addrlib assumes the bytes/pixel is a divisor of 64, which is not
-        * true for r32g32b32 formats. */
-       if (AddrSurfInfoIn->bpp == 96) {
-               assert(config->info.levels == 1);
-               assert(AddrSurfInfoIn->tileMode == ADDR_TM_LINEAR_ALIGNED);
-
-               /* The least common multiple of 64 bytes and 12 bytes/pixel is
-                * 192 bytes, or 16 pixels. */
-               AddrSurfInfoIn->width = align(AddrSurfInfoIn->width, 16);
-       }
-
-       if (config->is_3d)
-               AddrSurfInfoIn->numSlices = u_minify(config->info.depth, level);
-       else if (config->is_cube)
-               AddrSurfInfoIn->numSlices = 6;
-       else
-               AddrSurfInfoIn->numSlices = config->info.array_size;
-
-       if (level > 0) {
-               /* Set the base level pitch. This is needed for calculation
-                * of non-zero levels. */
-               if (is_stencil)
-                       AddrSurfInfoIn->basePitch = surf->u.legacy.stencil_level[0].nblk_x;
-               else
-                       AddrSurfInfoIn->basePitch = surf->u.legacy.level[0].nblk_x;
-
-               /* Convert blocks to pixels for compressed formats. */
-               if (compressed)
-                       AddrSurfInfoIn->basePitch *= surf->blk_w;
-       }
-
-       ret = AddrComputeSurfaceInfo(addrlib,
-                                    AddrSurfInfoIn,
-                                    AddrSurfInfoOut);
-       if (ret != ADDR_OK) {
-               return ret;
-       }
-
-       surf_level = is_stencil ? &surf->u.legacy.stencil_level[level] : &surf->u.legacy.level[level];
-       surf_level->offset = align64(surf->surf_size, AddrSurfInfoOut->baseAlign);
-       surf_level->slice_size_dw = AddrSurfInfoOut->sliceSize / 4;
-       surf_level->nblk_x = AddrSurfInfoOut->pitch;
-       surf_level->nblk_y = AddrSurfInfoOut->height;
-
-       switch (AddrSurfInfoOut->tileMode) {
-       case ADDR_TM_LINEAR_ALIGNED:
-               surf_level->mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
-               break;
-       case ADDR_TM_1D_TILED_THIN1:
-               surf_level->mode = RADEON_SURF_MODE_1D;
-               break;
-       case ADDR_TM_2D_TILED_THIN1:
-               surf_level->mode = RADEON_SURF_MODE_2D;
-               break;
-       default:
-               assert(0);
-       }
-
-       if (is_stencil)
-               surf->u.legacy.stencil_tiling_index[level] = AddrSurfInfoOut->tileIndex;
-       else
-               surf->u.legacy.tiling_index[level] = AddrSurfInfoOut->tileIndex;
-
-       surf->surf_size = surf_level->offset + AddrSurfInfoOut->surfSize;
-
-       /* Clear DCC fields at the beginning. */
-       surf_level->dcc_offset = 0;
-
-       /* The previous level's flag tells us if we can use DCC for this level. */
-       if (AddrSurfInfoIn->flags.dccCompatible &&
-           (level == 0 || AddrDccOut->subLvlCompressible)) {
-               bool prev_level_clearable = level == 0 ||
-                                           AddrDccOut->dccRamSizeAligned;
-
-               AddrDccIn->colorSurfSize = AddrSurfInfoOut->surfSize;
-               AddrDccIn->tileMode = AddrSurfInfoOut->tileMode;
-               AddrDccIn->tileInfo = *AddrSurfInfoOut->pTileInfo;
-               AddrDccIn->tileIndex = AddrSurfInfoOut->tileIndex;
-               AddrDccIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex;
-
-               ret = AddrComputeDccInfo(addrlib,
-                                        AddrDccIn,
-                                        AddrDccOut);
-
-               if (ret == ADDR_OK) {
-                       surf_level->dcc_offset = surf->dcc_size;
-                       surf->num_dcc_levels = level + 1;
-                       surf->dcc_size = surf_level->dcc_offset + AddrDccOut->dccRamSize;
-                       surf->dcc_alignment = MAX2(surf->dcc_alignment, AddrDccOut->dccRamBaseAlign);
-
-                       /* If the DCC size of a subresource (1 mip level or 1 slice)
-                        * is not aligned, the DCC memory layout is not contiguous for
-                        * that subresource, which means we can't use fast clear.
-                        *
-                        * We only do fast clears for whole mipmap levels. If we did
-                        * per-slice fast clears, the same restriction would apply.
-                        * (i.e. only compute the slice size and see if it's aligned)
-                        *
-                        * The last level can be non-contiguous and still be clearable
-                        * if it's interleaved with the next level that doesn't exist.
-                        */
-                       if (AddrDccOut->dccRamSizeAligned ||
-                           (prev_level_clearable && level == config->info.levels - 1))
-                               surf_level->dcc_fast_clear_size = AddrDccOut->dccFastClearSize;
-                       else
-                               surf_level->dcc_fast_clear_size = 0;
-
-                       /* Compute the DCC slice size because addrlib doesn't
-                        * provide this info. As DCC memory is linear (each
-                        * slice is the same size) it's easy to compute.
-                        */
-                       surf->dcc_slice_size = AddrDccOut->dccRamSize / config->info.array_size;
-
-                       /* For arrays, we have to compute the DCC info again
-                        * with one slice size to get a correct fast clear
-                        * size.
-                        */
-                       if (config->info.array_size > 1) {
-                               AddrDccIn->colorSurfSize = AddrSurfInfoOut->sliceSize;
-                               AddrDccIn->tileMode = AddrSurfInfoOut->tileMode;
-                               AddrDccIn->tileInfo = *AddrSurfInfoOut->pTileInfo;
-                               AddrDccIn->tileIndex = AddrSurfInfoOut->tileIndex;
-                               AddrDccIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex;
-
-                               ret = AddrComputeDccInfo(addrlib,
-                                                        AddrDccIn, AddrDccOut);
-                               if (ret == ADDR_OK) {
-                                       /* If the DCC memory isn't properly
-                                        * aligned, the data are interleaved
-                                        * accross slices.
-                                        */
-                                       if (AddrDccOut->dccRamSizeAligned)
-                                               surf_level->dcc_slice_fast_clear_size = AddrDccOut->dccFastClearSize;
-                                       else
-                                               surf_level->dcc_slice_fast_clear_size = 0;
-                               }
-
-                               if (surf->flags & RADEON_SURF_CONTIGUOUS_DCC_LAYERS &&
-                                   surf->dcc_slice_size != surf_level->dcc_slice_fast_clear_size) {
-                                       surf->dcc_size = 0;
-                                       surf->num_dcc_levels = 0;
-                                       AddrDccOut->subLvlCompressible = false;
-                               }
-                       } else {
-                               surf_level->dcc_slice_fast_clear_size = surf_level->dcc_fast_clear_size;
-                       }
-               }
-       }
-
-       /* HTILE. */
-       if (!is_stencil &&
-           AddrSurfInfoIn->flags.depth &&
-           surf_level->mode == RADEON_SURF_MODE_2D &&
-           level == 0 &&
-           !(surf->flags & RADEON_SURF_NO_HTILE)) {
-               AddrHtileIn->flags.tcCompatible = AddrSurfInfoOut->tcCompatible;
-               AddrHtileIn->pitch = AddrSurfInfoOut->pitch;
-               AddrHtileIn->height = AddrSurfInfoOut->height;
-               AddrHtileIn->numSlices = AddrSurfInfoOut->depth;
-               AddrHtileIn->blockWidth = ADDR_HTILE_BLOCKSIZE_8;
-               AddrHtileIn->blockHeight = ADDR_HTILE_BLOCKSIZE_8;
-               AddrHtileIn->pTileInfo = AddrSurfInfoOut->pTileInfo;
-               AddrHtileIn->tileIndex = AddrSurfInfoOut->tileIndex;
-               AddrHtileIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex;
-
-               ret = AddrComputeHtileInfo(addrlib,
-                                          AddrHtileIn,
-                                          AddrHtileOut);
-
-               if (ret == ADDR_OK) {
-                       surf->htile_size = AddrHtileOut->htileBytes;
-                       surf->htile_slice_size = AddrHtileOut->sliceSize;
-                       surf->htile_alignment = AddrHtileOut->baseAlign;
-               }
-       }
-
-       return 0;
+   struct legacy_surf_level *surf_level;
+   ADDR_E_RETURNCODE ret;
+
+   AddrSurfInfoIn->mipLevel = level;
+   AddrSurfInfoIn->width = u_minify(config->info.width, level);
+   AddrSurfInfoIn->height = u_minify(config->info.height, level);
+
+   /* Make GFX6 linear surfaces compatible with GFX9 for hybrid graphics,
+    * because GFX9 needs linear alignment of 256 bytes.
+    */
+   if (config->info.levels == 1 && AddrSurfInfoIn->tileMode == ADDR_TM_LINEAR_ALIGNED &&
+       AddrSurfInfoIn->bpp && util_is_power_of_two_or_zero(AddrSurfInfoIn->bpp)) {
+      unsigned alignment = 256 / (AddrSurfInfoIn->bpp / 8);
+
+      AddrSurfInfoIn->width = align(AddrSurfInfoIn->width, alignment);
+   }
+
+   /* addrlib assumes the bytes/pixel is a divisor of 64, which is not
+    * true for r32g32b32 formats. */
+   if (AddrSurfInfoIn->bpp == 96) {
+      assert(config->info.levels == 1);
+      assert(AddrSurfInfoIn->tileMode == ADDR_TM_LINEAR_ALIGNED);
+
+      /* The least common multiple of 64 bytes and 12 bytes/pixel is
+       * 192 bytes, or 16 pixels. */
+      AddrSurfInfoIn->width = align(AddrSurfInfoIn->width, 16);
+   }
+
+   if (config->is_3d)
+      AddrSurfInfoIn->numSlices = u_minify(config->info.depth, level);
+   else if (config->is_cube)
+      AddrSurfInfoIn->numSlices = 6;
+   else
+      AddrSurfInfoIn->numSlices = config->info.array_size;
+
+   if (level > 0) {
+      /* Set the base level pitch. This is needed for calculation
+       * of non-zero levels. */
+      if (is_stencil)
+         AddrSurfInfoIn->basePitch = surf->u.legacy.stencil_level[0].nblk_x;
+      else
+         AddrSurfInfoIn->basePitch = surf->u.legacy.level[0].nblk_x;
+
+      /* Convert blocks to pixels for compressed formats. */
+      if (compressed)
+         AddrSurfInfoIn->basePitch *= surf->blk_w;
+   }
+
+   ret = AddrComputeSurfaceInfo(addrlib, AddrSurfInfoIn, AddrSurfInfoOut);
+   if (ret != ADDR_OK) {
+      return ret;
+   }
+
+   surf_level = is_stencil ? &surf->u.legacy.stencil_level[level] : &surf->u.legacy.level[level];
+   surf_level->offset = align64(surf->surf_size, AddrSurfInfoOut->baseAlign);
+   surf_level->slice_size_dw = AddrSurfInfoOut->sliceSize / 4;
+   surf_level->nblk_x = AddrSurfInfoOut->pitch;
+   surf_level->nblk_y = AddrSurfInfoOut->height;
+
+   switch (AddrSurfInfoOut->tileMode) {
+   case ADDR_TM_LINEAR_ALIGNED:
+      surf_level->mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
+      break;
+   case ADDR_TM_1D_TILED_THIN1:
+      surf_level->mode = RADEON_SURF_MODE_1D;
+      break;
+   case ADDR_TM_2D_TILED_THIN1:
+      surf_level->mode = RADEON_SURF_MODE_2D;
+      break;
+   default:
+      assert(0);
+   }
+
+   if (is_stencil)
+      surf->u.legacy.stencil_tiling_index[level] = AddrSurfInfoOut->tileIndex;
+   else
+      surf->u.legacy.tiling_index[level] = AddrSurfInfoOut->tileIndex;
+
+   surf->surf_size = surf_level->offset + AddrSurfInfoOut->surfSize;
+
+   /* Clear DCC fields at the beginning. */
+   surf_level->dcc_offset = 0;
+
+   /* The previous level's flag tells us if we can use DCC for this level. */
+   if (AddrSurfInfoIn->flags.dccCompatible && (level == 0 || AddrDccOut->subLvlCompressible)) {
+      bool prev_level_clearable = level == 0 || AddrDccOut->dccRamSizeAligned;
+
+      AddrDccIn->colorSurfSize = AddrSurfInfoOut->surfSize;
+      AddrDccIn->tileMode = AddrSurfInfoOut->tileMode;
+      AddrDccIn->tileInfo = *AddrSurfInfoOut->pTileInfo;
+      AddrDccIn->tileIndex = AddrSurfInfoOut->tileIndex;
+      AddrDccIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex;
+
+      ret = AddrComputeDccInfo(addrlib, AddrDccIn, AddrDccOut);
+
+      if (ret == ADDR_OK) {
+         surf_level->dcc_offset = surf->dcc_size;
+         surf->num_dcc_levels = level + 1;
+         surf->dcc_size = surf_level->dcc_offset + AddrDccOut->dccRamSize;
+         surf->dcc_alignment = MAX2(surf->dcc_alignment, AddrDccOut->dccRamBaseAlign);
+
+         /* If the DCC size of a subresource (1 mip level or 1 slice)
+          * is not aligned, the DCC memory layout is not contiguous for
+          * that subresource, which means we can't use fast clear.
+          *
+          * We only do fast clears for whole mipmap levels. If we did
+          * per-slice fast clears, the same restriction would apply.
+          * (i.e. only compute the slice size and see if it's aligned)
+          *
+          * The last level can be non-contiguous and still be clearable
+          * if it's interleaved with the next level that doesn't exist.
+          */
+         if (AddrDccOut->dccRamSizeAligned ||
+             (prev_level_clearable && level == config->info.levels - 1))
+            surf_level->dcc_fast_clear_size = AddrDccOut->dccFastClearSize;
+         else
+            surf_level->dcc_fast_clear_size = 0;
+
+         /* Compute the DCC slice size because addrlib doesn't
+          * provide this info. As DCC memory is linear (each
+          * slice is the same size) it's easy to compute.
+          */
+         surf->dcc_slice_size = AddrDccOut->dccRamSize / config->info.array_size;
+
+         /* For arrays, we have to compute the DCC info again
+          * with one slice size to get a correct fast clear
+          * size.
+          */
+         if (config->info.array_size > 1) {
+            AddrDccIn->colorSurfSize = AddrSurfInfoOut->sliceSize;
+            AddrDccIn->tileMode = AddrSurfInfoOut->tileMode;
+            AddrDccIn->tileInfo = *AddrSurfInfoOut->pTileInfo;
+            AddrDccIn->tileIndex = AddrSurfInfoOut->tileIndex;
+            AddrDccIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex;
+
+            ret = AddrComputeDccInfo(addrlib, AddrDccIn, AddrDccOut);
+            if (ret == ADDR_OK) {
+               /* If the DCC memory isn't properly
+                * aligned, the data are interleaved
+                * across slices.
+                */
+               if (AddrDccOut->dccRamSizeAligned)
+                  surf_level->dcc_slice_fast_clear_size = AddrDccOut->dccFastClearSize;
+               else
+                  surf_level->dcc_slice_fast_clear_size = 0;
+            }
+
+            if (surf->flags & RADEON_SURF_CONTIGUOUS_DCC_LAYERS &&
+                surf->dcc_slice_size != surf_level->dcc_slice_fast_clear_size) {
+               surf->dcc_size = 0;
+               surf->num_dcc_levels = 0;
+               AddrDccOut->subLvlCompressible = false;
+            }
+         } else {
+            surf_level->dcc_slice_fast_clear_size = surf_level->dcc_fast_clear_size;
+         }
+      }
+   }
+
+   /* HTILE. */
+   if (!is_stencil && AddrSurfInfoIn->flags.depth && surf_level->mode == RADEON_SURF_MODE_2D &&
+       level == 0 && !(surf->flags & RADEON_SURF_NO_HTILE)) {
+      AddrHtileIn->flags.tcCompatible = AddrSurfInfoOut->tcCompatible;
+      AddrHtileIn->pitch = AddrSurfInfoOut->pitch;
+      AddrHtileIn->height = AddrSurfInfoOut->height;
+      AddrHtileIn->numSlices = AddrSurfInfoOut->depth;
+      AddrHtileIn->blockWidth = ADDR_HTILE_BLOCKSIZE_8;
+      AddrHtileIn->blockHeight = ADDR_HTILE_BLOCKSIZE_8;
+      AddrHtileIn->pTileInfo = AddrSurfInfoOut->pTileInfo;
+      AddrHtileIn->tileIndex = AddrSurfInfoOut->tileIndex;
+      AddrHtileIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex;
+
+      ret = AddrComputeHtileInfo(addrlib, AddrHtileIn, AddrHtileOut);
+
+      if (ret == ADDR_OK) {
+         surf->htile_size = AddrHtileOut->htileBytes;
+         surf->htile_slice_size = AddrHtileOut->sliceSize;
+         surf->htile_alignment = AddrHtileOut->baseAlign;
+      }
+   }
+
+   return 0;
 }
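The 96-bpp special case above is just a least-common-multiple computation: addrlib effectively needs each row to land on a 64-byte boundary, and with 12 bytes per pixel that means lcm(64, 12) = 192 bytes, i.e. 16 pixels of pitch alignment. A small generic sketch of that arithmetic (the helper names are made up, not existing util functions):

   static unsigned gcd(unsigned a, unsigned b)
   {
      while (b) {
         unsigned t = a % b;
         a = b;
         b = t;
      }
      return a;
   }

   /* Pixels of pitch alignment so that every row is a multiple of 64 bytes.
    * r32g32b32 (12 bytes/pixel): lcm(64, 12) = 192 bytes -> 16 pixels. */
   static unsigned row_align_pixels(unsigned bytes_per_pixel)
   {
      unsigned lcm = 64 / gcd(64, bytes_per_pixel) * bytes_per_pixel;
      return lcm / bytes_per_pixel;
   }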
 
-static void gfx6_set_micro_tile_mode(struct radeon_surf *surf,
-                                    const struct radeon_info *info)
+static void gfx6_set_micro_tile_mode(struct radeon_surf *surf, const struct radeon_info *info)
 {
-       uint32_t tile_mode = info->si_tile_mode_array[surf->u.legacy.tiling_index[0]];
+   uint32_t tile_mode = info->si_tile_mode_array[surf->u.legacy.tiling_index[0]];
 
-       if (info->chip_class >= GFX7)
-               surf->micro_tile_mode = G_009910_MICRO_TILE_MODE_NEW(tile_mode);
-       else
-               surf->micro_tile_mode = G_009910_MICRO_TILE_MODE(tile_mode);
+   if (info->chip_class >= GFX7)
+      surf->micro_tile_mode = G_009910_MICRO_TILE_MODE_NEW(tile_mode);
+   else
+      surf->micro_tile_mode = G_009910_MICRO_TILE_MODE(tile_mode);
 }
 
 static unsigned cik_get_macro_tile_index(struct radeon_surf *surf)
 {
-       unsigned index, tileb;
+   unsigned index, tileb;
 
-       tileb = 8 * 8 * surf->bpe;
-       tileb = MIN2(surf->u.legacy.tile_split, tileb);
+   tileb = 8 * 8 * surf->bpe;
+   tileb = MIN2(surf->u.legacy.tile_split, tileb);
 
-       for (index = 0; tileb > 64; index++)
-               tileb >>= 1;
+   for (index = 0; tileb > 64; index++)
+      tileb >>= 1;
 
-       assert(index < 16);
-       return index;
+   assert(index < 16);
+   return index;
 }
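Worked through, the loop above computes log2(tileb) - log2(64) for the power-of-two tileb values that occur here: with bpe = 4 and tile_split >= 256, tileb starts at 8 * 8 * 4 = 256 bytes and is halved twice (256 -> 128 -> 64), giving index 2; with bpe = 1 it starts at 64 and the loop never runs, giving index 0. An equivalent closed form, valid under that power-of-two assumption:

   /* index is the smallest value such that 64 << index covers tileb. */
   static unsigned macro_tile_index_pow2(unsigned tileb)
   {
      unsigned index = 0;
      while ((64u << index) < tileb)
         index++;
      return index;
   }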
 
-static bool get_display_flag(const struct ac_surf_config *config,
-                            const struct radeon_surf *surf)
+static bool get_display_flag(const struct ac_surf_config *config, const struct radeon_surf *surf)
 {
-       unsigned num_channels = config->info.num_channels;
-       unsigned bpe = surf->bpe;
-
-       if (!config->is_3d &&
-           !config->is_cube &&
-           !(surf->flags & RADEON_SURF_Z_OR_SBUFFER) &&
-           surf->flags & RADEON_SURF_SCANOUT &&
-           config->info.samples <= 1 &&
-           surf->blk_w <= 2 && surf->blk_h == 1) {
-               /* subsampled */
-               if (surf->blk_w == 2 && surf->blk_h == 1)
-                       return true;
-
-               if  (/* RGBA8 or RGBA16F */
-                    (bpe >= 4 && bpe <= 8 && num_channels == 4) ||
-                    /* R5G6B5 or R5G5B5A1 */
-                    (bpe == 2 && num_channels >= 3) ||
-                    /* C8 palette */
-                    (bpe == 1 && num_channels == 1))
-                       return true;
-       }
-       return false;
+   unsigned num_channels = config->info.num_channels;
+   unsigned bpe = surf->bpe;
+
+   if (!config->is_3d && !config->is_cube && !(surf->flags & RADEON_SURF_Z_OR_SBUFFER) &&
+       surf->flags & RADEON_SURF_SCANOUT && config->info.samples <= 1 && surf->blk_w <= 2 &&
+       surf->blk_h == 1) {
+      /* subsampled */
+      if (surf->blk_w == 2 && surf->blk_h == 1)
+         return true;
+
+      if (/* RGBA8 or RGBA16F */
+          (bpe >= 4 && bpe <= 8 && num_channels == 4) ||
+          /* R5G6B5 or R5G5B5A1 */
+          (bpe == 2 && num_channels >= 3) ||
+          /* C8 palette */
+          (bpe == 1 && num_channels == 1))
+         return true;
+   }
+   return false;
 }
 
 /**
@@ -745,119 +716,114 @@ static bool get_display_flag(const struct ac_surf_config *config,
  * Copy surface-global settings like pipe/bank config from level 0 surface
  * computation, and compute tile swizzle.
  */
-static int gfx6_surface_settings(ADDR_HANDLE addrlib,
-                                const struct radeon_info *info,
-                                const struct ac_surf_config *config,
-                                ADDR_COMPUTE_SURFACE_INFO_OUTPUT* csio,
-                                struct radeon_surf *surf)
+static int gfx6_surface_settings(ADDR_HANDLE addrlib, const struct radeon_info *info,
+                                 const struct ac_surf_config *config,
+                                 ADDR_COMPUTE_SURFACE_INFO_OUTPUT *csio, struct radeon_surf *surf)
 {
-       surf->surf_alignment = csio->baseAlign;
-       surf->u.legacy.pipe_config = csio->pTileInfo->pipeConfig - 1;
-       gfx6_set_micro_tile_mode(surf, info);
-
-       /* For 2D modes only. */
-       if (csio->tileMode >= ADDR_TM_2D_TILED_THIN1) {
-               surf->u.legacy.bankw = csio->pTileInfo->bankWidth;
-               surf->u.legacy.bankh = csio->pTileInfo->bankHeight;
-               surf->u.legacy.mtilea = csio->pTileInfo->macroAspectRatio;
-               surf->u.legacy.tile_split = csio->pTileInfo->tileSplitBytes;
-               surf->u.legacy.num_banks = csio->pTileInfo->banks;
-               surf->u.legacy.macro_tile_index = csio->macroModeIndex;
-       } else {
-               surf->u.legacy.macro_tile_index = 0;
-       }
-
-       /* Compute tile swizzle. */
-       /* TODO: fix tile swizzle with mipmapping for GFX6 */
-       if ((info->chip_class >= GFX7 || config->info.levels == 1) &&
-           config->info.surf_index &&
-           surf->u.legacy.level[0].mode == RADEON_SURF_MODE_2D &&
-           !(surf->flags & (RADEON_SURF_Z_OR_SBUFFER | RADEON_SURF_SHAREABLE)) &&
-           !get_display_flag(config, surf)) {
-               ADDR_COMPUTE_BASE_SWIZZLE_INPUT AddrBaseSwizzleIn = {0};
-               ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT AddrBaseSwizzleOut = {0};
-
-               AddrBaseSwizzleIn.size = sizeof(ADDR_COMPUTE_BASE_SWIZZLE_INPUT);
-               AddrBaseSwizzleOut.size = sizeof(ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT);
-
-               AddrBaseSwizzleIn.surfIndex = p_atomic_inc_return(config->info.surf_index) - 1;
-               AddrBaseSwizzleIn.tileIndex = csio->tileIndex;
-               AddrBaseSwizzleIn.macroModeIndex = csio->macroModeIndex;
-               AddrBaseSwizzleIn.pTileInfo = csio->pTileInfo;
-               AddrBaseSwizzleIn.tileMode = csio->tileMode;
-
-               int r = AddrComputeBaseSwizzle(addrlib, &AddrBaseSwizzleIn,
-                                              &AddrBaseSwizzleOut);
-               if (r != ADDR_OK)
-                       return r;
-
-               assert(AddrBaseSwizzleOut.tileSwizzle <=
-                      u_bit_consecutive(0, sizeof(surf->tile_swizzle) * 8));
-               surf->tile_swizzle = AddrBaseSwizzleOut.tileSwizzle;
-       }
-       return 0;
+   surf->surf_alignment = csio->baseAlign;
+   surf->u.legacy.pipe_config = csio->pTileInfo->pipeConfig - 1;
+   gfx6_set_micro_tile_mode(surf, info);
+
+   /* For 2D modes only. */
+   if (csio->tileMode >= ADDR_TM_2D_TILED_THIN1) {
+      surf->u.legacy.bankw = csio->pTileInfo->bankWidth;
+      surf->u.legacy.bankh = csio->pTileInfo->bankHeight;
+      surf->u.legacy.mtilea = csio->pTileInfo->macroAspectRatio;
+      surf->u.legacy.tile_split = csio->pTileInfo->tileSplitBytes;
+      surf->u.legacy.num_banks = csio->pTileInfo->banks;
+      surf->u.legacy.macro_tile_index = csio->macroModeIndex;
+   } else {
+      surf->u.legacy.macro_tile_index = 0;
+   }
+
+   /* Compute tile swizzle. */
+   /* TODO: fix tile swizzle with mipmapping for GFX6 */
+   if ((info->chip_class >= GFX7 || config->info.levels == 1) && config->info.surf_index &&
+       surf->u.legacy.level[0].mode == RADEON_SURF_MODE_2D &&
+       !(surf->flags & (RADEON_SURF_Z_OR_SBUFFER | RADEON_SURF_SHAREABLE)) &&
+       !get_display_flag(config, surf)) {
+      ADDR_COMPUTE_BASE_SWIZZLE_INPUT AddrBaseSwizzleIn = {0};
+      ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT AddrBaseSwizzleOut = {0};
+
+      AddrBaseSwizzleIn.size = sizeof(ADDR_COMPUTE_BASE_SWIZZLE_INPUT);
+      AddrBaseSwizzleOut.size = sizeof(ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT);
+
+      AddrBaseSwizzleIn.surfIndex = p_atomic_inc_return(config->info.surf_index) - 1;
+      AddrBaseSwizzleIn.tileIndex = csio->tileIndex;
+      AddrBaseSwizzleIn.macroModeIndex = csio->macroModeIndex;
+      AddrBaseSwizzleIn.pTileInfo = csio->pTileInfo;
+      AddrBaseSwizzleIn.tileMode = csio->tileMode;
+
+      int r = AddrComputeBaseSwizzle(addrlib, &AddrBaseSwizzleIn, &AddrBaseSwizzleOut);
+      if (r != ADDR_OK)
+         return r;
+
+      assert(AddrBaseSwizzleOut.tileSwizzle <=
+             u_bit_consecutive(0, sizeof(surf->tile_swizzle) * 8));
+      surf->tile_swizzle = AddrBaseSwizzleOut.tileSwizzle;
+   }
+   return 0;
 }
 
-static void ac_compute_cmask(const struct radeon_info *info,
-                            const struct ac_surf_config *config,
-                            struct radeon_surf *surf)
+static void ac_compute_cmask(const struct radeon_info *info, const struct ac_surf_config *config,
+                             struct radeon_surf *surf)
 {
-       unsigned pipe_interleave_bytes = info->pipe_interleave_bytes;
-       unsigned num_pipes = info->num_tile_pipes;
-       unsigned cl_width, cl_height;
-
-       if (surf->flags & RADEON_SURF_Z_OR_SBUFFER || surf->is_linear ||
-           (config->info.samples >= 2 && !surf->fmask_size))
-               return;
-
-       assert(info->chip_class <= GFX8);
-
-       switch (num_pipes) {
-       case 2:
-               cl_width = 32;
-               cl_height = 16;
-               break;
-       case 4:
-               cl_width = 32;
-               cl_height = 32;
-               break;
-       case 8:
-               cl_width = 64;
-               cl_height = 32;
-               break;
-       case 16: /* Hawaii */
-               cl_width = 64;
-               cl_height = 64;
-               break;
-       default:
-               assert(0);
-               return;
-       }
-
-       unsigned base_align = num_pipes * pipe_interleave_bytes;
-
-       unsigned width = align(surf->u.legacy.level[0].nblk_x, cl_width*8);
-       unsigned height = align(surf->u.legacy.level[0].nblk_y, cl_height*8);
-       unsigned slice_elements = (width * height) / (8*8);
-
-       /* Each element of CMASK is a nibble. */
-       unsigned slice_bytes = slice_elements / 2;
-
-       surf->u.legacy.cmask_slice_tile_max = (width * height) / (128*128);
-       if (surf->u.legacy.cmask_slice_tile_max)
-               surf->u.legacy.cmask_slice_tile_max -= 1;
-
-       unsigned num_layers;
-       if (config->is_3d)
-               num_layers = config->info.depth;
-       else if (config->is_cube)
-               num_layers = 6;
-       else
-               num_layers = config->info.array_size;
-
-       surf->cmask_alignment = MAX2(256, base_align);
-       surf->cmask_slice_size = align(slice_bytes, base_align);
-       surf->cmask_size = surf->cmask_slice_size * num_layers;
+   unsigned pipe_interleave_bytes = info->pipe_interleave_bytes;
+   unsigned num_pipes = info->num_tile_pipes;
+   unsigned cl_width, cl_height;
+
+   if (surf->flags & RADEON_SURF_Z_OR_SBUFFER || surf->is_linear ||
+       (config->info.samples >= 2 && !surf->fmask_size))
+      return;
+
+   assert(info->chip_class <= GFX8);
+
+   switch (num_pipes) {
+   case 2:
+      cl_width = 32;
+      cl_height = 16;
+      break;
+   case 4:
+      cl_width = 32;
+      cl_height = 32;
+      break;
+   case 8:
+      cl_width = 64;
+      cl_height = 32;
+      break;
+   case 16: /* Hawaii */
+      cl_width = 64;
+      cl_height = 64;
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   unsigned base_align = num_pipes * pipe_interleave_bytes;
+
+   unsigned width = align(surf->u.legacy.level[0].nblk_x, cl_width * 8);
+   unsigned height = align(surf->u.legacy.level[0].nblk_y, cl_height * 8);
+   unsigned slice_elements = (width * height) / (8 * 8);
+
+   /* Each element of CMASK is a nibble. */
+   unsigned slice_bytes = slice_elements / 2;
+
+   surf->u.legacy.cmask_slice_tile_max = (width * height) / (128 * 128);
+   if (surf->u.legacy.cmask_slice_tile_max)
+      surf->u.legacy.cmask_slice_tile_max -= 1;
+
+   unsigned num_layers;
+   if (config->is_3d)
+      num_layers = config->info.depth;
+   else if (config->is_cube)
+      num_layers = 6;
+   else
+      num_layers = config->info.array_size;
+
+   surf->cmask_alignment = MAX2(256, base_align);
+   surf->cmask_slice_size = align(slice_bytes, base_align);
+   surf->cmask_size = surf->cmask_slice_size * num_layers;
 }
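As a worked example of the sizing above (assuming the common pipe_interleave_bytes of 256, which is an assumption here, and a 1920x1080 single-sample color surface where nblk equals pixels): with 4 pipes the cache-line tile is 32x32, so the dimensions are aligned to 256x256, giving 2048x1280. That is 2048 * 1280 / 64 = 40960 8x8 tiles at one nibble each, so 20480 bytes per slice; base_align is 4 * 256 = 1024 bytes, which 20480 already satisfies, and cmask_slice_tile_max becomes 2048 * 1280 / (128 * 128) - 1 = 159.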
 
 /**
@@ -866,1416 +832,1327 @@ static void ac_compute_cmask(const struct radeon_info *info,
  * The following fields of \p surf must be initialized by the caller:
  * blk_w, blk_h, bpe, flags.
  */
-static int gfx6_compute_surface(ADDR_HANDLE addrlib,
-                               const struct radeon_info *info,
-                               const struct ac_surf_config *config,
-                               enum radeon_surf_mode mode,
-                               struct radeon_surf *surf)
+static int gfx6_compute_surface(ADDR_HANDLE addrlib, const struct radeon_info *info,
+                                const struct ac_surf_config *config, enum radeon_surf_mode mode,
+                                struct radeon_surf *surf)
 {
-       unsigned level;
-       bool compressed;
-       ADDR_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0};
-       ADDR_COMPUTE_SURFACE_INFO_OUTPUT AddrSurfInfoOut = {0};
-       ADDR_COMPUTE_DCCINFO_INPUT AddrDccIn = {0};
-       ADDR_COMPUTE_DCCINFO_OUTPUT AddrDccOut = {0};
-       ADDR_COMPUTE_HTILE_INFO_INPUT AddrHtileIn = {0};
-       ADDR_COMPUTE_HTILE_INFO_OUTPUT AddrHtileOut = {0};
-       ADDR_TILEINFO AddrTileInfoIn = {0};
-       ADDR_TILEINFO AddrTileInfoOut = {0};
-       int r;
-
-       AddrSurfInfoIn.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_INPUT);
-       AddrSurfInfoOut.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_OUTPUT);
-       AddrDccIn.size = sizeof(ADDR_COMPUTE_DCCINFO_INPUT);
-       AddrDccOut.size = sizeof(ADDR_COMPUTE_DCCINFO_OUTPUT);
-       AddrHtileIn.size = sizeof(ADDR_COMPUTE_HTILE_INFO_INPUT);
-       AddrHtileOut.size = sizeof(ADDR_COMPUTE_HTILE_INFO_OUTPUT);
-       AddrSurfInfoOut.pTileInfo = &AddrTileInfoOut;
-
-       compressed = surf->blk_w == 4 && surf->blk_h == 4;
-
-       /* MSAA requires 2D tiling. */
-       if (config->info.samples > 1)
-               mode = RADEON_SURF_MODE_2D;
-
-       /* DB doesn't support linear layouts. */
-       if (surf->flags & (RADEON_SURF_Z_OR_SBUFFER) &&
-           mode < RADEON_SURF_MODE_1D)
-               mode = RADEON_SURF_MODE_1D;
-
-       /* Set the requested tiling mode. */
-       switch (mode) {
-       case RADEON_SURF_MODE_LINEAR_ALIGNED:
-               AddrSurfInfoIn.tileMode = ADDR_TM_LINEAR_ALIGNED;
-               break;
-       case RADEON_SURF_MODE_1D:
-               AddrSurfInfoIn.tileMode = ADDR_TM_1D_TILED_THIN1;
-               break;
-       case RADEON_SURF_MODE_2D:
-               AddrSurfInfoIn.tileMode = ADDR_TM_2D_TILED_THIN1;
-               break;
-       default:
-               assert(0);
-       }
-
-       /* The format must be set correctly for the allocation of compressed
-        * textures to work. In other cases, setting the bpp is sufficient.
-        */
-       if (compressed) {
-               switch (surf->bpe) {
-               case 8:
-                       AddrSurfInfoIn.format = ADDR_FMT_BC1;
-                       break;
-               case 16:
-                       AddrSurfInfoIn.format = ADDR_FMT_BC3;
-                       break;
-               default:
-                       assert(0);
-               }
-       }
-       else {
-               AddrDccIn.bpp = AddrSurfInfoIn.bpp = surf->bpe * 8;
-       }
-
-       AddrDccIn.numSamples = AddrSurfInfoIn.numSamples =
-               MAX2(1, config->info.samples);
-       AddrSurfInfoIn.tileIndex = -1;
-
-       if (!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)) {
-               AddrDccIn.numSamples = AddrSurfInfoIn.numFrags =
-                       MAX2(1, config->info.storage_samples);
-       }
-
-       /* Set the micro tile type. */
-       if (surf->flags & RADEON_SURF_SCANOUT)
-               AddrSurfInfoIn.tileType = ADDR_DISPLAYABLE;
-       else if (surf->flags & RADEON_SURF_Z_OR_SBUFFER)
-               AddrSurfInfoIn.tileType = ADDR_DEPTH_SAMPLE_ORDER;
-       else
-               AddrSurfInfoIn.tileType = ADDR_NON_DISPLAYABLE;
-
-       AddrSurfInfoIn.flags.color = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER);
-       AddrSurfInfoIn.flags.depth = (surf->flags & RADEON_SURF_ZBUFFER) != 0;
-       AddrSurfInfoIn.flags.cube = config->is_cube;
-       AddrSurfInfoIn.flags.display = get_display_flag(config, surf);
-       AddrSurfInfoIn.flags.pow2Pad = config->info.levels > 1;
-       AddrSurfInfoIn.flags.tcCompatible = (surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE) != 0;
-
-       /* Only degrade the tile mode for space if TC-compatible HTILE hasn't been
-        * requested, because TC-compatible HTILE requires 2D tiling.
-        */
-       AddrSurfInfoIn.flags.opt4Space = !AddrSurfInfoIn.flags.tcCompatible &&
-                                        !AddrSurfInfoIn.flags.fmask &&
-                                        config->info.samples <= 1 &&
-                                        !(surf->flags & RADEON_SURF_FORCE_SWIZZLE_MODE);
-
-       /* DCC notes:
-        * - If we add MSAA support, keep in mind that CB can't decompress 8bpp
-        *   with samples >= 4.
-        * - Mipmapped array textures have low performance (discovered by a closed
-        *   driver team).
-        */
-       AddrSurfInfoIn.flags.dccCompatible =
-               info->chip_class >= GFX8 &&
-               info->has_graphics && /* disable DCC on compute-only chips */
-               !(surf->flags & RADEON_SURF_Z_OR_SBUFFER) &&
-               !(surf->flags & RADEON_SURF_DISABLE_DCC) &&
-               !compressed &&
-               ((config->info.array_size == 1 && config->info.depth == 1) ||
-                config->info.levels == 1);
-
-       AddrSurfInfoIn.flags.noStencil = (surf->flags & RADEON_SURF_SBUFFER) == 0;
-       AddrSurfInfoIn.flags.compressZ = !!(surf->flags & RADEON_SURF_Z_OR_SBUFFER);
-
-       /* On GFX7-GFX8, the DB uses the same pitch and tile mode (except tilesplit)
-        * for Z and stencil. This can cause a number of problems which we work
-        * around here:
-        *
-        * - a depth part that is incompatible with mipmapped texturing
-        * - at least on Stoney, entirely incompatible Z/S aspects (e.g.
-        *   incorrect tiling applied to the stencil part, stencil buffer
-        *   memory accesses that go out of bounds) even without mipmapping
-        *
-        * Some piglit tests that are prone to different types of related
-        * failures:
-        *  ./bin/ext_framebuffer_multisample-upsample 2 stencil
-        *  ./bin/framebuffer-blit-levels {draw,read} stencil
-        *  ./bin/ext_framebuffer_multisample-unaligned-blit N {depth,stencil} {msaa,upsample,downsample}
-        *  ./bin/fbo-depth-array fs-writes-{depth,stencil} / {depth,stencil}-{clear,layered-clear,draw}
-        *  ./bin/depthstencil-render-miplevels 1024 d=s=z24_s8
-        */
-       int stencil_tile_idx = -1;
-
-       if (AddrSurfInfoIn.flags.depth && !AddrSurfInfoIn.flags.noStencil &&
-           (config->info.levels > 1 || info->family == CHIP_STONEY)) {
-               /* Compute stencilTileIdx that is compatible with the (depth)
-                * tileIdx. This degrades the depth surface if necessary to
-                * ensure that a matching stencilTileIdx exists. */
-               AddrSurfInfoIn.flags.matchStencilTileCfg = 1;
-
-               /* Keep the depth mip-tail compatible with texturing. */
-               AddrSurfInfoIn.flags.noStencil = 1;
-       }
-
-       /* Set preferred macrotile parameters. This is usually required
-        * for shared resources. This is for 2D tiling only. */
-       if (AddrSurfInfoIn.tileMode >= ADDR_TM_2D_TILED_THIN1 &&
-           surf->u.legacy.bankw && surf->u.legacy.bankh &&
-           surf->u.legacy.mtilea && surf->u.legacy.tile_split) {
-               /* If any of these parameters are incorrect, the calculation
-                * will fail. */
-               AddrTileInfoIn.banks = surf->u.legacy.num_banks;
-               AddrTileInfoIn.bankWidth = surf->u.legacy.bankw;
-               AddrTileInfoIn.bankHeight = surf->u.legacy.bankh;
-               AddrTileInfoIn.macroAspectRatio = surf->u.legacy.mtilea;
-               AddrTileInfoIn.tileSplitBytes = surf->u.legacy.tile_split;
-               AddrTileInfoIn.pipeConfig = surf->u.legacy.pipe_config + 1; /* +1 compared to GB_TILE_MODE */
-               AddrSurfInfoIn.flags.opt4Space = 0;
-               AddrSurfInfoIn.pTileInfo = &AddrTileInfoIn;
-
-               /* If AddrSurfInfoIn.pTileInfo is set, Addrlib doesn't set
-                * the tile index, because we are expected to know it if
-                * we know the other parameters.
-                *
-                * This is something that can easily be fixed in Addrlib.
-                * For now, just figure it out here.
-                * Note that only 2D_TILE_THIN1 is handled here.
-                */
-               assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
-               assert(AddrSurfInfoIn.tileMode == ADDR_TM_2D_TILED_THIN1);
-
-               if (info->chip_class == GFX6) {
-                       if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE) {
-                               if (surf->bpe == 2)
-                                       AddrSurfInfoIn.tileIndex = 11; /* 16bpp */
-                               else
-                                       AddrSurfInfoIn.tileIndex = 12; /* 32bpp */
-                       } else {
-                               if (surf->bpe == 1)
-                                       AddrSurfInfoIn.tileIndex = 14; /* 8bpp */
-                               else if (surf->bpe == 2)
-                                       AddrSurfInfoIn.tileIndex = 15; /* 16bpp */
-                               else if (surf->bpe == 4)
-                                       AddrSurfInfoIn.tileIndex = 16; /* 32bpp */
-                               else
-                                       AddrSurfInfoIn.tileIndex = 17; /* 64bpp (and 128bpp) */
-                       }
-               } else {
-                       /* GFX7 - GFX8 */
-                       if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE)
-                               AddrSurfInfoIn.tileIndex = 10; /* 2D displayable */
-                       else
-                               AddrSurfInfoIn.tileIndex = 14; /* 2D non-displayable */
-
-                       /* Addrlib doesn't set this if tileIndex is forced like above. */
-                       AddrSurfInfoOut.macroModeIndex = cik_get_macro_tile_index(surf);
-               }
-       }
-
-       surf->has_stencil = !!(surf->flags & RADEON_SURF_SBUFFER);
-       surf->num_dcc_levels = 0;
-       surf->surf_size = 0;
-       surf->dcc_size = 0;
-       surf->dcc_alignment = 1;
-       surf->htile_size = 0;
-       surf->htile_slice_size = 0;
-       surf->htile_alignment = 1;
-
-       const bool only_stencil = (surf->flags & RADEON_SURF_SBUFFER) &&
-                                 !(surf->flags & RADEON_SURF_ZBUFFER);
-
-       /* Calculate texture layout information. */
-       if (!only_stencil) {
-               for (level = 0; level < config->info.levels; level++) {
-                       r = gfx6_compute_level(addrlib, config, surf, false, level, compressed,
-                                              &AddrSurfInfoIn, &AddrSurfInfoOut,
-                                              &AddrDccIn, &AddrDccOut, &AddrHtileIn, &AddrHtileOut);
-                       if (r)
-                               return r;
-
-                       if (level > 0)
-                               continue;
-
-                       if (!AddrSurfInfoOut.tcCompatible) {
-                               AddrSurfInfoIn.flags.tcCompatible = 0;
-                               surf->flags &= ~RADEON_SURF_TC_COMPATIBLE_HTILE;
-                       }
-
-                       if (AddrSurfInfoIn.flags.matchStencilTileCfg) {
-                               AddrSurfInfoIn.flags.matchStencilTileCfg = 0;
-                               AddrSurfInfoIn.tileIndex = AddrSurfInfoOut.tileIndex;
-                               stencil_tile_idx = AddrSurfInfoOut.stencilTileIdx;
-
-                               assert(stencil_tile_idx >= 0);
-                       }
-
-                       r = gfx6_surface_settings(addrlib, info, config,
-                                                 &AddrSurfInfoOut, surf);
-                       if (r)
-                               return r;
-               }
-       }
-
-       /* Calculate texture layout information for stencil. */
-       if (surf->flags & RADEON_SURF_SBUFFER) {
-               AddrSurfInfoIn.tileIndex = stencil_tile_idx;
-               AddrSurfInfoIn.bpp = 8;
-               AddrSurfInfoIn.flags.depth = 0;
-               AddrSurfInfoIn.flags.stencil = 1;
-               AddrSurfInfoIn.flags.tcCompatible = 0;
-               /* This will be ignored if AddrSurfInfoIn.pTileInfo is NULL. */
-               AddrTileInfoIn.tileSplitBytes = surf->u.legacy.stencil_tile_split;
-
-               for (level = 0; level < config->info.levels; level++) {
-                       r = gfx6_compute_level(addrlib, config, surf, true, level, compressed,
-                                              &AddrSurfInfoIn, &AddrSurfInfoOut,
-                                              &AddrDccIn, &AddrDccOut,
-                                              NULL, NULL);
-                       if (r)
-                               return r;
-
-                       /* DB uses the depth pitch for both stencil and depth. */
-                       if (!only_stencil) {
-                               if (surf->u.legacy.stencil_level[level].nblk_x !=
-                                   surf->u.legacy.level[level].nblk_x)
-                                       surf->u.legacy.stencil_adjusted = true;
-                       } else {
-                               surf->u.legacy.level[level].nblk_x =
-                                       surf->u.legacy.stencil_level[level].nblk_x;
-                       }
-
-                       if (level == 0) {
-                               if (only_stencil) {
-                                       r = gfx6_surface_settings(addrlib, info, config,
-                                                                 &AddrSurfInfoOut, surf);
-                                       if (r)
-                                               return r;
-                               }
-
-                               /* For 2D modes only. */
-                               if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) {
-                                       surf->u.legacy.stencil_tile_split =
-                                               AddrSurfInfoOut.pTileInfo->tileSplitBytes;
-                               }
-                       }
-               }
-       }
-
-       /* Compute FMASK. */
-       if (config->info.samples >= 2 && AddrSurfInfoIn.flags.color &&
-           info->has_graphics && !(surf->flags & RADEON_SURF_NO_FMASK)) {
-               ADDR_COMPUTE_FMASK_INFO_INPUT fin = {0};
-               ADDR_COMPUTE_FMASK_INFO_OUTPUT fout = {0};
-               ADDR_TILEINFO fmask_tile_info = {};
-
-               fin.size = sizeof(fin);
-               fout.size = sizeof(fout);
-
-               fin.tileMode = AddrSurfInfoOut.tileMode;
-               fin.pitch = AddrSurfInfoOut.pitch;
-               fin.height = config->info.height;
-               fin.numSlices = AddrSurfInfoIn.numSlices;
-               fin.numSamples = AddrSurfInfoIn.numSamples;
-               fin.numFrags = AddrSurfInfoIn.numFrags;
-               fin.tileIndex = -1;
-               fout.pTileInfo = &fmask_tile_info;
-
-               r = AddrComputeFmaskInfo(addrlib, &fin, &fout);
-               if (r)
-                       return r;
-
-               surf->fmask_size = fout.fmaskBytes;
-               surf->fmask_alignment = fout.baseAlign;
-               surf->fmask_tile_swizzle = 0;
-
-               surf->u.legacy.fmask.slice_tile_max =
-                       (fout.pitch * fout.height) / 64;
-               if (surf->u.legacy.fmask.slice_tile_max)
-                   surf->u.legacy.fmask.slice_tile_max -= 1;
-
-               surf->u.legacy.fmask.tiling_index = fout.tileIndex;
-               surf->u.legacy.fmask.bankh = fout.pTileInfo->bankHeight;
-               surf->u.legacy.fmask.pitch_in_pixels = fout.pitch;
-               surf->u.legacy.fmask.slice_size = fout.sliceSize;
-
-               /* Compute tile swizzle for FMASK. */
-               if (config->info.fmask_surf_index &&
-                   !(surf->flags & RADEON_SURF_SHAREABLE)) {
-                       ADDR_COMPUTE_BASE_SWIZZLE_INPUT xin = {0};
-                       ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT xout = {0};
-
-                       xin.size = sizeof(ADDR_COMPUTE_BASE_SWIZZLE_INPUT);
-                       xout.size = sizeof(ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT);
-
-                       /* This counter starts from 1 instead of 0. */
-                       xin.surfIndex = p_atomic_inc_return(config->info.fmask_surf_index);
-                       xin.tileIndex = fout.tileIndex;
-                       xin.macroModeIndex = fout.macroModeIndex;
-                       xin.pTileInfo = fout.pTileInfo;
-                       xin.tileMode = fin.tileMode;
-
-                       int r = AddrComputeBaseSwizzle(addrlib, &xin, &xout);
-                       if (r != ADDR_OK)
-                               return r;
-
-                       assert(xout.tileSwizzle <=
-                              u_bit_consecutive(0, sizeof(surf->tile_swizzle) * 8));
-                       surf->fmask_tile_swizzle = xout.tileSwizzle;
-               }
-       }
-
-       /* Recalculate the whole DCC miptree size including disabled levels.
-        * This is what addrlib does, but calling addrlib would be a lot more
-        * complicated.
-        */
-       if (surf->dcc_size && config->info.levels > 1) {
-               /* The smallest miplevels that are never compressed by DCC
-                * still read the DCC buffer via TC if the base level uses DCC,
-                * and for some reason the DCC buffer needs to be larger if
-                * the miptree uses non-zero tile_swizzle. Otherwise there are
-                * VM faults.
-                *
-                * "dcc_alignment * 4" was determined by trial and error.
-                */
-               surf->dcc_size = align64(surf->surf_size >> 8,
-                                        surf->dcc_alignment * 4);
-       }
-
-       /* Make sure HTILE covers the whole miptree, because the shader reads
-        * TC-compatible HTILE even for levels where it's disabled by DB.
-        */
-       if (surf->htile_size && config->info.levels > 1 &&
-           surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE) {
-               /* MSAA can't occur with levels > 1, so ignore the sample count. */
-               const unsigned total_pixels = surf->surf_size / surf->bpe;
-               const unsigned htile_block_size = 8 * 8;
-               const unsigned htile_element_size = 4;
-
-               surf->htile_size = (total_pixels / htile_block_size) *
-                                  htile_element_size;
-               surf->htile_size = align(surf->htile_size, surf->htile_alignment);
-       } else if (!surf->htile_size) {
-               /* Unset this if HTILE is not present. */
-               surf->flags &= ~RADEON_SURF_TC_COMPATIBLE_HTILE;
-       }
-
-       surf->is_linear = surf->u.legacy.level[0].mode == RADEON_SURF_MODE_LINEAR_ALIGNED;
-       surf->is_displayable = surf->is_linear ||
-                              surf->micro_tile_mode == RADEON_MICRO_MODE_DISPLAY ||
-                              surf->micro_tile_mode == RADEON_MICRO_MODE_RENDER;
-
-       /* The rotated micro tile mode doesn't work if both CMASK and RB+ are
-        * used at the same time. This case is not currently expected to occur
-        * because we don't use rotated. Enforce this restriction on all chips
-        * to facilitate testing.
-        */
-       if (surf->micro_tile_mode == RADEON_MICRO_MODE_RENDER) {
-               assert(!"rotate micro tile mode is unsupported");
-               return ADDR_ERROR;
-       }
-
-       ac_compute_cmask(info, config, surf);
-       return 0;
+   unsigned level;
+   bool compressed;
+   ADDR_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0};
+   ADDR_COMPUTE_SURFACE_INFO_OUTPUT AddrSurfInfoOut = {0};
+   ADDR_COMPUTE_DCCINFO_INPUT AddrDccIn = {0};
+   ADDR_COMPUTE_DCCINFO_OUTPUT AddrDccOut = {0};
+   ADDR_COMPUTE_HTILE_INFO_INPUT AddrHtileIn = {0};
+   ADDR_COMPUTE_HTILE_INFO_OUTPUT AddrHtileOut = {0};
+   ADDR_TILEINFO AddrTileInfoIn = {0};
+   ADDR_TILEINFO AddrTileInfoOut = {0};
+   int r;
+
+   AddrSurfInfoIn.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_INPUT);
+   AddrSurfInfoOut.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_OUTPUT);
+   AddrDccIn.size = sizeof(ADDR_COMPUTE_DCCINFO_INPUT);
+   AddrDccOut.size = sizeof(ADDR_COMPUTE_DCCINFO_OUTPUT);
+   AddrHtileIn.size = sizeof(ADDR_COMPUTE_HTILE_INFO_INPUT);
+   AddrHtileOut.size = sizeof(ADDR_COMPUTE_HTILE_INFO_OUTPUT);
+   AddrSurfInfoOut.pTileInfo = &AddrTileInfoOut;
+
+   compressed = surf->blk_w == 4 && surf->blk_h == 4;
+
+   /* MSAA requires 2D tiling. */
+   if (config->info.samples > 1)
+      mode = RADEON_SURF_MODE_2D;
+
+   /* DB doesn't support linear layouts. */
+   if (surf->flags & (RADEON_SURF_Z_OR_SBUFFER) && mode < RADEON_SURF_MODE_1D)
+      mode = RADEON_SURF_MODE_1D;
+
+   /* Set the requested tiling mode. */
+   switch (mode) {
+   case RADEON_SURF_MODE_LINEAR_ALIGNED:
+      AddrSurfInfoIn.tileMode = ADDR_TM_LINEAR_ALIGNED;
+      break;
+   case RADEON_SURF_MODE_1D:
+      AddrSurfInfoIn.tileMode = ADDR_TM_1D_TILED_THIN1;
+      break;
+   case RADEON_SURF_MODE_2D:
+      AddrSurfInfoIn.tileMode = ADDR_TM_2D_TILED_THIN1;
+      break;
+   default:
+      assert(0);
+   }
+
+   /* The format must be set correctly for the allocation of compressed
+    * textures to work. In other cases, setting the bpp is sufficient.
+    */
+   if (compressed) {
+      switch (surf->bpe) {
+      case 8:
+         AddrSurfInfoIn.format = ADDR_FMT_BC1;
+         break;
+      case 16:
+         AddrSurfInfoIn.format = ADDR_FMT_BC3;
+         break;
+      default:
+         assert(0);
+      }
+   } else {
+      AddrDccIn.bpp = AddrSurfInfoIn.bpp = surf->bpe * 8;
+   }
+
+   AddrDccIn.numSamples = AddrSurfInfoIn.numSamples = MAX2(1, config->info.samples);
+   AddrSurfInfoIn.tileIndex = -1;
+
+   if (!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)) {
+      AddrDccIn.numSamples = AddrSurfInfoIn.numFrags = MAX2(1, config->info.storage_samples);
+   }
+
+   /* Set the micro tile type. */
+   if (surf->flags & RADEON_SURF_SCANOUT)
+      AddrSurfInfoIn.tileType = ADDR_DISPLAYABLE;
+   else if (surf->flags & RADEON_SURF_Z_OR_SBUFFER)
+      AddrSurfInfoIn.tileType = ADDR_DEPTH_SAMPLE_ORDER;
+   else
+      AddrSurfInfoIn.tileType = ADDR_NON_DISPLAYABLE;
+
+   AddrSurfInfoIn.flags.color = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER);
+   AddrSurfInfoIn.flags.depth = (surf->flags & RADEON_SURF_ZBUFFER) != 0;
+   AddrSurfInfoIn.flags.cube = config->is_cube;
+   AddrSurfInfoIn.flags.display = get_display_flag(config, surf);
+   AddrSurfInfoIn.flags.pow2Pad = config->info.levels > 1;
+   AddrSurfInfoIn.flags.tcCompatible = (surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE) != 0;
+
+   /* Only degrade the tile mode for space if TC-compatible HTILE hasn't been
+    * requested, because TC-compatible HTILE requires 2D tiling.
+    */
+   AddrSurfInfoIn.flags.opt4Space = !AddrSurfInfoIn.flags.tcCompatible &&
+                                    !AddrSurfInfoIn.flags.fmask && config->info.samples <= 1 &&
+                                    !(surf->flags & RADEON_SURF_FORCE_SWIZZLE_MODE);
+
+   /* DCC notes:
+    * - If we add MSAA support, keep in mind that CB can't decompress 8bpp
+    *   with samples >= 4.
+    * - Mipmapped array textures have low performance (discovered by a closed
+    *   driver team).
+    */
+   AddrSurfInfoIn.flags.dccCompatible =
+      info->chip_class >= GFX8 && info->has_graphics && /* disable DCC on compute-only chips */
+      !(surf->flags & RADEON_SURF_Z_OR_SBUFFER) && !(surf->flags & RADEON_SURF_DISABLE_DCC) &&
+      !compressed &&
+      ((config->info.array_size == 1 && config->info.depth == 1) || config->info.levels == 1);
+
+   AddrSurfInfoIn.flags.noStencil = (surf->flags & RADEON_SURF_SBUFFER) == 0;
+   AddrSurfInfoIn.flags.compressZ = !!(surf->flags & RADEON_SURF_Z_OR_SBUFFER);
+
+   /* On GFX7-GFX8, the DB uses the same pitch and tile mode (except tilesplit)
+    * for Z and stencil. This can cause a number of problems which we work
+    * around here:
+    *
+    * - a depth part that is incompatible with mipmapped texturing
+    * - at least on Stoney, entirely incompatible Z/S aspects (e.g.
+    *   incorrect tiling applied to the stencil part, stencil buffer
+    *   memory accesses that go out of bounds) even without mipmapping
+    *
+    * Some piglit tests that are prone to different types of related
+    * failures:
+    *  ./bin/ext_framebuffer_multisample-upsample 2 stencil
+    *  ./bin/framebuffer-blit-levels {draw,read} stencil
+    *  ./bin/ext_framebuffer_multisample-unaligned-blit N {depth,stencil} {msaa,upsample,downsample}
+    *  ./bin/fbo-depth-array fs-writes-{depth,stencil} / {depth,stencil}-{clear,layered-clear,draw}
+    *  ./bin/depthstencil-render-miplevels 1024 d=s=z24_s8
+    */
+   int stencil_tile_idx = -1;
+
+   if (AddrSurfInfoIn.flags.depth && !AddrSurfInfoIn.flags.noStencil &&
+       (config->info.levels > 1 || info->family == CHIP_STONEY)) {
+      /* Compute stencilTileIdx that is compatible with the (depth)
+       * tileIdx. This degrades the depth surface if necessary to
+       * ensure that a matching stencilTileIdx exists. */
+      AddrSurfInfoIn.flags.matchStencilTileCfg = 1;
+
+      /* Keep the depth mip-tail compatible with texturing. */
+      AddrSurfInfoIn.flags.noStencil = 1;
+   }
+
+   /* Set preferred macrotile parameters. This is usually required
+    * for shared resources. This is for 2D tiling only. */
+   if (AddrSurfInfoIn.tileMode >= ADDR_TM_2D_TILED_THIN1 && surf->u.legacy.bankw &&
+       surf->u.legacy.bankh && surf->u.legacy.mtilea && surf->u.legacy.tile_split) {
+      /* If any of these parameters are incorrect, the calculation
+       * will fail. */
+      AddrTileInfoIn.banks = surf->u.legacy.num_banks;
+      AddrTileInfoIn.bankWidth = surf->u.legacy.bankw;
+      AddrTileInfoIn.bankHeight = surf->u.legacy.bankh;
+      AddrTileInfoIn.macroAspectRatio = surf->u.legacy.mtilea;
+      AddrTileInfoIn.tileSplitBytes = surf->u.legacy.tile_split;
+      AddrTileInfoIn.pipeConfig = surf->u.legacy.pipe_config + 1; /* +1 compared to GB_TILE_MODE */
+      AddrSurfInfoIn.flags.opt4Space = 0;
+      AddrSurfInfoIn.pTileInfo = &AddrTileInfoIn;
+
+      /* If AddrSurfInfoIn.pTileInfo is set, Addrlib doesn't set
+       * the tile index, because we are expected to know it if
+       * we know the other parameters.
+       *
+       * This is something that can easily be fixed in Addrlib.
+       * For now, just figure it out here.
+       * Note that only 2D_TILE_THIN1 is handled here.
+       */
+      assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
+      assert(AddrSurfInfoIn.tileMode == ADDR_TM_2D_TILED_THIN1);
+
+      if (info->chip_class == GFX6) {
+         if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE) {
+            if (surf->bpe == 2)
+               AddrSurfInfoIn.tileIndex = 11; /* 16bpp */
+            else
+               AddrSurfInfoIn.tileIndex = 12; /* 32bpp */
+         } else {
+            if (surf->bpe == 1)
+               AddrSurfInfoIn.tileIndex = 14; /* 8bpp */
+            else if (surf->bpe == 2)
+               AddrSurfInfoIn.tileIndex = 15; /* 16bpp */
+            else if (surf->bpe == 4)
+               AddrSurfInfoIn.tileIndex = 16; /* 32bpp */
+            else
+               AddrSurfInfoIn.tileIndex = 17; /* 64bpp (and 128bpp) */
+         }
+      } else {
+         /* GFX7 - GFX8 */
+         if (AddrSurfInfoIn.tileType == ADDR_DISPLAYABLE)
+            AddrSurfInfoIn.tileIndex = 10; /* 2D displayable */
+         else
+            AddrSurfInfoIn.tileIndex = 14; /* 2D non-displayable */
+
+         /* Addrlib doesn't set this if tileIndex is forced like above. */
+         AddrSurfInfoOut.macroModeIndex = cik_get_macro_tile_index(surf);
+      }
+   }
+
+   surf->has_stencil = !!(surf->flags & RADEON_SURF_SBUFFER);
+   surf->num_dcc_levels = 0;
+   surf->surf_size = 0;
+   surf->dcc_size = 0;
+   surf->dcc_alignment = 1;
+   surf->htile_size = 0;
+   surf->htile_slice_size = 0;
+   surf->htile_alignment = 1;
+
+   const bool only_stencil =
+      (surf->flags & RADEON_SURF_SBUFFER) && !(surf->flags & RADEON_SURF_ZBUFFER);
+
+   /* Calculate texture layout information. */
+   if (!only_stencil) {
+      for (level = 0; level < config->info.levels; level++) {
+         r = gfx6_compute_level(addrlib, config, surf, false, level, compressed, &AddrSurfInfoIn,
+                                &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut, &AddrHtileIn,
+                                &AddrHtileOut);
+         if (r)
+            return r;
+
+         if (level > 0)
+            continue;
+
+         if (!AddrSurfInfoOut.tcCompatible) {
+            AddrSurfInfoIn.flags.tcCompatible = 0;
+            surf->flags &= ~RADEON_SURF_TC_COMPATIBLE_HTILE;
+         }
+
+         if (AddrSurfInfoIn.flags.matchStencilTileCfg) {
+            AddrSurfInfoIn.flags.matchStencilTileCfg = 0;
+            AddrSurfInfoIn.tileIndex = AddrSurfInfoOut.tileIndex;
+            stencil_tile_idx = AddrSurfInfoOut.stencilTileIdx;
+
+            assert(stencil_tile_idx >= 0);
+         }
+
+         r = gfx6_surface_settings(addrlib, info, config, &AddrSurfInfoOut, surf);
+         if (r)
+            return r;
+      }
+   }
+
+   /* Calculate texture layout information for stencil. */
+   if (surf->flags & RADEON_SURF_SBUFFER) {
+      AddrSurfInfoIn.tileIndex = stencil_tile_idx;
+      AddrSurfInfoIn.bpp = 8;
+      AddrSurfInfoIn.flags.depth = 0;
+      AddrSurfInfoIn.flags.stencil = 1;
+      AddrSurfInfoIn.flags.tcCompatible = 0;
+      /* This will be ignored if AddrSurfInfoIn.pTileInfo is NULL. */
+      AddrTileInfoIn.tileSplitBytes = surf->u.legacy.stencil_tile_split;
+
+      for (level = 0; level < config->info.levels; level++) {
+         r = gfx6_compute_level(addrlib, config, surf, true, level, compressed, &AddrSurfInfoIn,
+                                &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut, NULL, NULL);
+         if (r)
+            return r;
+
+         /* DB uses the depth pitch for both stencil and depth. */
+         if (!only_stencil) {
+            if (surf->u.legacy.stencil_level[level].nblk_x != surf->u.legacy.level[level].nblk_x)
+               surf->u.legacy.stencil_adjusted = true;
+         } else {
+            surf->u.legacy.level[level].nblk_x = surf->u.legacy.stencil_level[level].nblk_x;
+         }
+
+         if (level == 0) {
+            if (only_stencil) {
+               r = gfx6_surface_settings(addrlib, info, config, &AddrSurfInfoOut, surf);
+               if (r)
+                  return r;
+            }
+
+            /* For 2D modes only. */
+            if (AddrSurfInfoOut.tileMode >= ADDR_TM_2D_TILED_THIN1) {
+               surf->u.legacy.stencil_tile_split = AddrSurfInfoOut.pTileInfo->tileSplitBytes;
+            }
+         }
+      }
+   }
+
+   /* Compute FMASK. */
+   if (config->info.samples >= 2 && AddrSurfInfoIn.flags.color && info->has_graphics &&
+       !(surf->flags & RADEON_SURF_NO_FMASK)) {
+      ADDR_COMPUTE_FMASK_INFO_INPUT fin = {0};
+      ADDR_COMPUTE_FMASK_INFO_OUTPUT fout = {0};
+      ADDR_TILEINFO fmask_tile_info = {};
+
+      fin.size = sizeof(fin);
+      fout.size = sizeof(fout);
+
+      fin.tileMode = AddrSurfInfoOut.tileMode;
+      fin.pitch = AddrSurfInfoOut.pitch;
+      fin.height = config->info.height;
+      fin.numSlices = AddrSurfInfoIn.numSlices;
+      fin.numSamples = AddrSurfInfoIn.numSamples;
+      fin.numFrags = AddrSurfInfoIn.numFrags;
+      fin.tileIndex = -1;
+      fout.pTileInfo = &fmask_tile_info;
+
+      r = AddrComputeFmaskInfo(addrlib, &fin, &fout);
+      if (r)
+         return r;
+
+      surf->fmask_size = fout.fmaskBytes;
+      surf->fmask_alignment = fout.baseAlign;
+      surf->fmask_tile_swizzle = 0;
+
+      surf->u.legacy.fmask.slice_tile_max = (fout.pitch * fout.height) / 64;
+      if (surf->u.legacy.fmask.slice_tile_max)
+         surf->u.legacy.fmask.slice_tile_max -= 1;
+
+      surf->u.legacy.fmask.tiling_index = fout.tileIndex;
+      surf->u.legacy.fmask.bankh = fout.pTileInfo->bankHeight;
+      surf->u.legacy.fmask.pitch_in_pixels = fout.pitch;
+      surf->u.legacy.fmask.slice_size = fout.sliceSize;
+
+      /* Compute tile swizzle for FMASK. */
+      if (config->info.fmask_surf_index && !(surf->flags & RADEON_SURF_SHAREABLE)) {
+         ADDR_COMPUTE_BASE_SWIZZLE_INPUT xin = {0};
+         ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT xout = {0};
+
+         xin.size = sizeof(ADDR_COMPUTE_BASE_SWIZZLE_INPUT);
+         xout.size = sizeof(ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT);
+
+         /* This counter starts from 1 instead of 0. */
+         xin.surfIndex = p_atomic_inc_return(config->info.fmask_surf_index);
+         xin.tileIndex = fout.tileIndex;
+         xin.macroModeIndex = fout.macroModeIndex;
+         xin.pTileInfo = fout.pTileInfo;
+         xin.tileMode = fin.tileMode;
+
+         int r = AddrComputeBaseSwizzle(addrlib, &xin, &xout);
+         if (r != ADDR_OK)
+            return r;
+
+         assert(xout.tileSwizzle <= u_bit_consecutive(0, sizeof(surf->tile_swizzle) * 8));
+         surf->fmask_tile_swizzle = xout.tileSwizzle;
+      }
+   }
+
+   /* Recalculate the whole DCC miptree size including disabled levels.
+    * This is what addrlib does, but calling addrlib would be a lot more
+    * complicated.
+    */
+   if (surf->dcc_size && config->info.levels > 1) {
+      /* The smallest miplevels that are never compressed by DCC
+       * still read the DCC buffer via TC if the base level uses DCC,
+       * and for some reason the DCC buffer needs to be larger if
+       * the miptree uses non-zero tile_swizzle. Otherwise there are
+       * VM faults.
+       *
+       * "dcc_alignment * 4" was determined by trial and error.
+       */
+      surf->dcc_size = align64(surf->surf_size >> 8, surf->dcc_alignment * 4);
+   }
+
+   /* Make sure HTILE covers the whole miptree, because the shader reads
+    * TC-compatible HTILE even for levels where it's disabled by DB.
+    */
+   if (surf->htile_size && config->info.levels > 1 &&
+       surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE) {
+      /* MSAA can't occur with levels > 1, so ignore the sample count. */
+      const unsigned total_pixels = surf->surf_size / surf->bpe;
+      const unsigned htile_block_size = 8 * 8;
+      const unsigned htile_element_size = 4;
+
+      surf->htile_size = (total_pixels / htile_block_size) * htile_element_size;
+      surf->htile_size = align(surf->htile_size, surf->htile_alignment);
+   } else if (!surf->htile_size) {
+      /* Unset this if HTILE is not present. */
+      surf->flags &= ~RADEON_SURF_TC_COMPATIBLE_HTILE;
+   }
+
+   surf->is_linear = surf->u.legacy.level[0].mode == RADEON_SURF_MODE_LINEAR_ALIGNED;
+   surf->is_displayable = surf->is_linear || surf->micro_tile_mode == RADEON_MICRO_MODE_DISPLAY ||
+                          surf->micro_tile_mode == RADEON_MICRO_MODE_RENDER;
+
+   /* The rotated micro tile mode doesn't work if both CMASK and RB+ are
+    * used at the same time. This case is not currently expected to occur
+    * because we don't use rotated. Enforce this restriction on all chips
+    * to facilitate testing.
+    */
+   if (surf->micro_tile_mode == RADEON_MICRO_MODE_RENDER) {
+      assert(!"rotate micro tile mode is unsupported");
+      return ADDR_ERROR;
+   }
+
+   ac_compute_cmask(info, config, surf);
+   return 0;
 }
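
For reference, here is a minimal standalone sketch of the two sizing formulas used near the end of the function above: the whole-miptree DCC size (one metadata byte per 256 surface bytes, padded to four times the DCC alignment, as the comment on "dcc_alignment * 4" describes) and the TC-compatible HTILE size (one 4-byte element per 8x8-pixel block, padded to the HTILE alignment). The surface size, bpe and alignment values below are made-up inputs, not taken from any real chip, and align_pot is a local stand-in for the align/align64 helpers.

#include <inttypes.h>
#include <stdio.h>

/* Round 'v' up to the next multiple of 'a' (a must be a power of two). */
static uint64_t align_pot(uint64_t v, uint64_t a)
{
   return (v + a - 1) & ~(a - 1);
}

int main(void)
{
   /* Hypothetical 32bpp miptree: 64 MiB of data, 64 KiB DCC alignment,
    * 2 KiB HTILE alignment. */
   const uint64_t surf_size = 64ull * 1024 * 1024;
   const uint64_t bpe = 4;
   const uint64_t dcc_alignment = 65536;
   const uint64_t htile_alignment = 2048;

   /* DCC covering all levels: 1/256 of the surface, padded as above. */
   uint64_t dcc_size = align_pot(surf_size >> 8, dcc_alignment * 4);

   /* TC-compatible HTILE covering all levels. */
   uint64_t total_pixels = surf_size / bpe;
   uint64_t htile_size = align_pot((total_pixels / (8 * 8)) * 4, htile_alignment);

   printf("dcc_size = %" PRIu64 ", htile_size = %" PRIu64 " bytes\n", dcc_size, htile_size);
   return 0;
}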
 
 /* This is only called when expecting a tiled layout. */
-static int
-gfx9_get_preferred_swizzle_mode(ADDR_HANDLE addrlib,
-                               struct radeon_surf *surf,
-                               ADDR2_COMPUTE_SURFACE_INFO_INPUT *in,
-                               bool is_fmask, AddrSwizzleMode *swizzle_mode)
+static int gfx9_get_preferred_swizzle_mode(ADDR_HANDLE addrlib, struct radeon_surf *surf,
+                                           ADDR2_COMPUTE_SURFACE_INFO_INPUT *in, bool is_fmask,
+                                           AddrSwizzleMode *swizzle_mode)
 {
-       ADDR_E_RETURNCODE ret;
-       ADDR2_GET_PREFERRED_SURF_SETTING_INPUT sin = {0};
-       ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT sout = {0};
-
-       sin.size = sizeof(ADDR2_GET_PREFERRED_SURF_SETTING_INPUT);
-       sout.size = sizeof(ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT);
-
-       sin.flags = in->flags;
-       sin.resourceType = in->resourceType;
-       sin.format = in->format;
-       sin.resourceLoction = ADDR_RSRC_LOC_INVIS;
-       /* TODO: We could allow some of these: */
-       sin.forbiddenBlock.micro = 1; /* don't allow the 256B swizzle modes */
-       sin.forbiddenBlock.var = 1; /* don't allow the variable-sized swizzle modes */
-       sin.bpp = in->bpp;
-       sin.width = in->width;
-       sin.height = in->height;
-       sin.numSlices = in->numSlices;
-       sin.numMipLevels = in->numMipLevels;
-       sin.numSamples = in->numSamples;
-       sin.numFrags = in->numFrags;
-
-       if (is_fmask) {
-               sin.flags.display = 0;
-               sin.flags.color = 0;
-               sin.flags.fmask = 1;
-       }
-
-       if (surf->flags & RADEON_SURF_FORCE_MICRO_TILE_MODE) {
-               sin.forbiddenBlock.linear = 1;
-
-               if (surf->micro_tile_mode == RADEON_MICRO_MODE_DISPLAY)
-                       sin.preferredSwSet.sw_D = 1;
-               else if (surf->micro_tile_mode == RADEON_MICRO_MODE_STANDARD)
-                       sin.preferredSwSet.sw_S = 1;
-               else if (surf->micro_tile_mode == RADEON_MICRO_MODE_DEPTH)
-                       sin.preferredSwSet.sw_Z = 1;
-               else if (surf->micro_tile_mode == RADEON_MICRO_MODE_RENDER)
-                       sin.preferredSwSet.sw_R = 1;
-       }
-
-       ret = Addr2GetPreferredSurfaceSetting(addrlib, &sin, &sout);
-       if (ret != ADDR_OK)
-               return ret;
-
-       *swizzle_mode = sout.swizzleMode;
-       return 0;
+   ADDR_E_RETURNCODE ret;
+   ADDR2_GET_PREFERRED_SURF_SETTING_INPUT sin = {0};
+   ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT sout = {0};
+
+   sin.size = sizeof(ADDR2_GET_PREFERRED_SURF_SETTING_INPUT);
+   sout.size = sizeof(ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT);
+
+   sin.flags = in->flags;
+   sin.resourceType = in->resourceType;
+   sin.format = in->format;
+   sin.resourceLoction = ADDR_RSRC_LOC_INVIS;
+   /* TODO: We could allow some of these: */
+   sin.forbiddenBlock.micro = 1; /* don't allow the 256B swizzle modes */
+   sin.forbiddenBlock.var = 1;   /* don't allow the variable-sized swizzle modes */
+   sin.bpp = in->bpp;
+   sin.width = in->width;
+   sin.height = in->height;
+   sin.numSlices = in->numSlices;
+   sin.numMipLevels = in->numMipLevels;
+   sin.numSamples = in->numSamples;
+   sin.numFrags = in->numFrags;
+
+   if (is_fmask) {
+      sin.flags.display = 0;
+      sin.flags.color = 0;
+      sin.flags.fmask = 1;
+   }
+
+   if (surf->flags & RADEON_SURF_FORCE_MICRO_TILE_MODE) {
+      sin.forbiddenBlock.linear = 1;
+
+      if (surf->micro_tile_mode == RADEON_MICRO_MODE_DISPLAY)
+         sin.preferredSwSet.sw_D = 1;
+      else if (surf->micro_tile_mode == RADEON_MICRO_MODE_STANDARD)
+         sin.preferredSwSet.sw_S = 1;
+      else if (surf->micro_tile_mode == RADEON_MICRO_MODE_DEPTH)
+         sin.preferredSwSet.sw_Z = 1;
+      else if (surf->micro_tile_mode == RADEON_MICRO_MODE_RENDER)
+         sin.preferredSwSet.sw_R = 1;
+   }
+
+   ret = Addr2GetPreferredSurfaceSetting(addrlib, &sin, &sout);
+   if (ret != ADDR_OK)
+      return ret;
+
+   *swizzle_mode = sout.swizzleMode;
+   return 0;
 }
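
To make the call pattern concrete, a minimal sketch of how a caller might fill the input and query a swizzle mode follows. The addrlib handle and surf are assumed to already be in scope, the dimensions, bpp and format value are made up rather than derived from a real surface, and the FMASK path in gfx9_compute_miptree further down is the call that actually appears in this file (with is_fmask = true so the helper restricts the result to FMASK-capable modes).

/* Sketch only: addrlib_handle (ADDR_HANDLE) and surf are assumed to be in
 * scope; the values below are illustrative, not taken from a real surface. */
ADDR2_COMPUTE_SURFACE_INFO_INPUT in = {0};
AddrSwizzleMode swizzle_mode;

in.size = sizeof(ADDR2_COMPUTE_SURFACE_INFO_INPUT);
in.flags.color = 1;
in.resourceType = ADDR_RSRC_TEX_2D;
in.format = ADDR_FMT_32; /* assumed 32bpp addrlib format */
in.bpp = 32;
in.width = 1920;
in.height = 1080;
in.numSlices = 1;
in.numMipLevels = 1;
in.numSamples = 1;
in.numFrags = 1;

if (!gfx9_get_preferred_swizzle_mode(addrlib_handle, surf, &in, false, &swizzle_mode))
   in.swizzleMode = swizzle_mode; /* use the preferred tiled mode for the main surface */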
 
 static bool is_dcc_supported_by_CB(const struct radeon_info *info, unsigned sw_mode)
 {
-       if (info->chip_class >= GFX10)
-               return sw_mode == ADDR_SW_64KB_Z_X || sw_mode == ADDR_SW_64KB_R_X;
+   if (info->chip_class >= GFX10)
+      return sw_mode == ADDR_SW_64KB_Z_X || sw_mode == ADDR_SW_64KB_R_X;
 
-       return sw_mode != ADDR_SW_LINEAR;
+   return sw_mode != ADDR_SW_LINEAR;
 }
 
 ASSERTED static bool is_dcc_supported_by_L2(const struct radeon_info *info,
-                                           const struct radeon_surf *surf)
+                                            const struct radeon_surf *surf)
 {
-       if (info->chip_class <= GFX9) {
-               /* Only independent 64B blocks are supported. */
-               return surf->u.gfx9.dcc.independent_64B_blocks &&
-                      !surf->u.gfx9.dcc.independent_128B_blocks &&
-                      surf->u.gfx9.dcc.max_compressed_block_size == V_028C78_MAX_BLOCK_SIZE_64B;
-       }
-
-       if (info->family == CHIP_NAVI10) {
-               /* Only independent 128B blocks are supported. */
-               return !surf->u.gfx9.dcc.independent_64B_blocks &&
-                      surf->u.gfx9.dcc.independent_128B_blocks &&
-                      surf->u.gfx9.dcc.max_compressed_block_size <= V_028C78_MAX_BLOCK_SIZE_128B;
-       }
-
-       if (info->family == CHIP_NAVI12 ||
-           info->family == CHIP_NAVI14) {
-               /* Either 64B or 128B can be used, but not both.
-                * If 64B is used, DCC image stores are unsupported.
-                */
-               return surf->u.gfx9.dcc.independent_64B_blocks !=
-                      surf->u.gfx9.dcc.independent_128B_blocks &&
-                      (!surf->u.gfx9.dcc.independent_64B_blocks ||
-                       surf->u.gfx9.dcc.max_compressed_block_size == V_028C78_MAX_BLOCK_SIZE_64B) &&
-                      (!surf->u.gfx9.dcc.independent_128B_blocks ||
-                       surf->u.gfx9.dcc.max_compressed_block_size <= V_028C78_MAX_BLOCK_SIZE_128B);
-       }
-
-       /* 128B is recommended, but 64B can be set too if needed for 4K by DCN.
-        * Since there is no reason to ever disable 128B, require it.
-        * DCC image stores are always supported.
-        */
-       return surf->u.gfx9.dcc.independent_128B_blocks &&
-              surf->u.gfx9.dcc.max_compressed_block_size <= V_028C78_MAX_BLOCK_SIZE_128B;
+   if (info->chip_class <= GFX9) {
+      /* Only independent 64B blocks are supported. */
+      return surf->u.gfx9.dcc.independent_64B_blocks && !surf->u.gfx9.dcc.independent_128B_blocks &&
+             surf->u.gfx9.dcc.max_compressed_block_size == V_028C78_MAX_BLOCK_SIZE_64B;
+   }
+
+   if (info->family == CHIP_NAVI10) {
+      /* Only independent 128B blocks are supported. */
+      return !surf->u.gfx9.dcc.independent_64B_blocks && surf->u.gfx9.dcc.independent_128B_blocks &&
+             surf->u.gfx9.dcc.max_compressed_block_size <= V_028C78_MAX_BLOCK_SIZE_128B;
+   }
+
+   if (info->family == CHIP_NAVI12 || info->family == CHIP_NAVI14) {
+      /* Either 64B or 128B can be used, but not both.
+       * If 64B is used, DCC image stores are unsupported.
+       */
+      return surf->u.gfx9.dcc.independent_64B_blocks != surf->u.gfx9.dcc.independent_128B_blocks &&
+             (!surf->u.gfx9.dcc.independent_64B_blocks ||
+              surf->u.gfx9.dcc.max_compressed_block_size == V_028C78_MAX_BLOCK_SIZE_64B) &&
+             (!surf->u.gfx9.dcc.independent_128B_blocks ||
+              surf->u.gfx9.dcc.max_compressed_block_size <= V_028C78_MAX_BLOCK_SIZE_128B);
+   }
+
+   /* 128B is recommended, but 64B can be set too if needed for 4K by DCN.
+    * Since there is no reason to ever disable 128B, require it.
+    * DCC image stores are always supported.
+    */
+   return surf->u.gfx9.dcc.independent_128B_blocks &&
+          surf->u.gfx9.dcc.max_compressed_block_size <= V_028C78_MAX_BLOCK_SIZE_128B;
 }
 
 static bool is_dcc_supported_by_DCN(const struct radeon_info *info,
-                                   const struct ac_surf_config *config,
-                                   const struct radeon_surf *surf,
-                                   bool rb_aligned, bool pipe_aligned)
+                                    const struct ac_surf_config *config,
+                                    const struct radeon_surf *surf, bool rb_aligned,
+                                    bool pipe_aligned)
 {
-       if (!info->use_display_dcc_unaligned &&
-           !info->use_display_dcc_with_retile_blit)
-               return false;
-
-       /* 16bpp and 64bpp are more complicated, so they are disallowed for now. */
-       if (surf->bpe != 4)
-               return false;
-
-       /* Handle unaligned DCC. */
-       if (info->use_display_dcc_unaligned &&
-           (rb_aligned || pipe_aligned))
-               return false;
-
-       switch (info->chip_class) {
-       case GFX9:
-               /* There are more constraints, but we always set
-                * INDEPENDENT_64B_BLOCKS = 1 and MAX_COMPRESSED_BLOCK_SIZE = 64B,
-                * which always works.
-                */
-               assert(surf->u.gfx9.dcc.independent_64B_blocks &&
-                      surf->u.gfx9.dcc.max_compressed_block_size == V_028C78_MAX_BLOCK_SIZE_64B);
-               return true;
-       case GFX10:
-       case GFX10_3:
-               /* DCN requires INDEPENDENT_128B_BLOCKS = 0 only on Navi1x. */
-               if (info->chip_class == GFX10 &&
-                   surf->u.gfx9.dcc.independent_128B_blocks)
-                       return false;
-
-               /* For 4K, DCN requires INDEPENDENT_64B_BLOCKS = 1. */
-               return ((config->info.width <= 2560 &&
-                        config->info.height <= 2560) ||
-                       (surf->u.gfx9.dcc.independent_64B_blocks &&
-                        surf->u.gfx9.dcc.max_compressed_block_size == V_028C78_MAX_BLOCK_SIZE_64B));
-       default:
-               unreachable("unhandled chip");
-               return false;
-       }
+   if (!info->use_display_dcc_unaligned && !info->use_display_dcc_with_retile_blit)
+      return false;
+
+   /* 16bpp and 64bpp are more complicated, so they are disallowed for now. */
+   if (surf->bpe != 4)
+      return false;
+
+   /* Handle unaligned DCC. */
+   if (info->use_display_dcc_unaligned && (rb_aligned || pipe_aligned))
+      return false;
+
+   switch (info->chip_class) {
+   case GFX9:
+      /* There are more constraints, but we always set
+       * INDEPENDENT_64B_BLOCKS = 1 and MAX_COMPRESSED_BLOCK_SIZE = 64B,
+       * which always works.
+       */
+      assert(surf->u.gfx9.dcc.independent_64B_blocks &&
+             surf->u.gfx9.dcc.max_compressed_block_size == V_028C78_MAX_BLOCK_SIZE_64B);
+      return true;
+   case GFX10:
+   case GFX10_3:
+      /* DCN requires INDEPENDENT_128B_BLOCKS = 0 only on Navi1x. */
+      if (info->chip_class == GFX10 && surf->u.gfx9.dcc.independent_128B_blocks)
+         return false;
+
+      /* For 4K, DCN requires INDEPENDENT_64B_BLOCKS = 1. */
+      return ((config->info.width <= 2560 && config->info.height <= 2560) ||
+              (surf->u.gfx9.dcc.independent_64B_blocks &&
+               surf->u.gfx9.dcc.max_compressed_block_size == V_028C78_MAX_BLOCK_SIZE_64B));
+   default:
+      unreachable("unhandled chip");
+      return false;
+   }
 }
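
As a quick worked example of the GFX10/GFX10.3 branch above, here is a small standalone sketch of the resolution rule (the 2560x2560 threshold and the independent-64B fallback for larger displays). The helper name and the sample resolutions are made up, and the literal 64 stands in for V_028C78_MAX_BLOCK_SIZE_64B.

#include <stdbool.h>
#include <stdio.h>

/* Simplified model of the GFX10+ check above: below the 2560x2560 threshold
 * DCN accepts the DCC layout as-is; at 4K it additionally needs independent
 * 64B blocks with a 64B maximum compressed block size. */
static bool dcn_accepts(unsigned width, unsigned height, bool independent_64B_blocks,
                        unsigned max_compressed_block_size)
{
   return (width <= 2560 && height <= 2560) ||
          (independent_64B_blocks && max_compressed_block_size == 64);
}

int main(void)
{
   printf("%d\n", dcn_accepts(2560, 1440, false, 128)); /* 1: below the threshold */
   printf("%d\n", dcn_accepts(3840, 2160, false, 128)); /* 0: 4K needs the 64B setting */
   printf("%d\n", dcn_accepts(3840, 2160, true, 64));   /* 1: 4K with 64B blocks */
   return 0;
}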
 
-static int gfx9_compute_miptree(struct ac_addrlib *addrlib,
-                               const struct radeon_info *info,
-                               const struct ac_surf_config *config,
-                               struct radeon_surf *surf, bool compressed,
-                               ADDR2_COMPUTE_SURFACE_INFO_INPUT *in)
+static int gfx9_compute_miptree(struct ac_addrlib *addrlib, const struct radeon_info *info,
+                                const struct ac_surf_config *config, struct radeon_surf *surf,
+                                bool compressed, ADDR2_COMPUTE_SURFACE_INFO_INPUT *in)
 {
-       ADDR2_MIP_INFO mip_info[RADEON_SURF_MAX_LEVELS] = {};
-       ADDR2_COMPUTE_SURFACE_INFO_OUTPUT out = {0};
-       ADDR_E_RETURNCODE ret;
-
-       out.size = sizeof(ADDR2_COMPUTE_SURFACE_INFO_OUTPUT);
-       out.pMipInfo = mip_info;
-
-       ret = Addr2ComputeSurfaceInfo(addrlib->handle, in, &out);
-       if (ret != ADDR_OK)
-               return ret;
-
-       if (in->flags.stencil) {
-               surf->u.gfx9.stencil.swizzle_mode = in->swizzleMode;
-               surf->u.gfx9.stencil.epitch = out.epitchIsHeight ? out.mipChainHeight - 1 :
-                                                                  out.mipChainPitch - 1;
-               surf->surf_alignment = MAX2(surf->surf_alignment, out.baseAlign);
-               surf->u.gfx9.stencil_offset = align(surf->surf_size, out.baseAlign);
-               surf->surf_size = surf->u.gfx9.stencil_offset + out.surfSize;
-               return 0;
-       }
-
-       surf->u.gfx9.surf.swizzle_mode = in->swizzleMode;
-       surf->u.gfx9.surf.epitch = out.epitchIsHeight ? out.mipChainHeight - 1 :
-                                                       out.mipChainPitch - 1;
-
-       /* CMASK fast clear uses these even if FMASK isn't allocated.
-        * FMASK only supports the Z swizzle modes, whose numbers are multiples of 4.
-        */
-       surf->u.gfx9.fmask.swizzle_mode = surf->u.gfx9.surf.swizzle_mode & ~0x3;
-       surf->u.gfx9.fmask.epitch = surf->u.gfx9.surf.epitch;
-
-       surf->u.gfx9.surf_slice_size = out.sliceSize;
-       surf->u.gfx9.surf_pitch = out.pitch;
-       surf->u.gfx9.surf_height = out.height;
-       surf->surf_size = out.surfSize;
-       surf->surf_alignment = out.baseAlign;
-
-       if (!compressed && surf->blk_w > 1 && out.pitch == out.pixelPitch &&
-           surf->u.gfx9.surf.swizzle_mode == ADDR_SW_LINEAR) {
-               /* Adjust surf_pitch to be in elements units not in pixels */
-               surf->u.gfx9.surf_pitch =
-                       align(surf->u.gfx9.surf_pitch / surf->blk_w, 256 / surf->bpe);
-               surf->u.gfx9.surf.epitch = MAX2(surf->u.gfx9.surf.epitch,
-                                               surf->u.gfx9.surf_pitch * surf->blk_w - 1);
-               /* The surface is really a surf->bpe bytes per pixel surface even if we
-                * use it as a surf->bpe bytes per element one.
-                * Adjust surf_slice_size and surf_size to reflect the change
-                * made to surf_pitch.
-                */
-               surf->u.gfx9.surf_slice_size = MAX2(
-                       surf->u.gfx9.surf_slice_size,
-                       surf->u.gfx9.surf_pitch * out.height * surf->bpe * surf->blk_w);
-               surf->surf_size = surf->u.gfx9.surf_slice_size * in->numSlices;
-       }
-
-       if (in->swizzleMode == ADDR_SW_LINEAR) {
-               for (unsigned i = 0; i < in->numMipLevels; i++) {
-                       surf->u.gfx9.offset[i] = mip_info[i].offset;
-                       surf->u.gfx9.pitch[i] = mip_info[i].pitch;
-               }
-       }
-
-       if (in->flags.depth) {
-               assert(in->swizzleMode != ADDR_SW_LINEAR);
-
-               if (surf->flags & RADEON_SURF_NO_HTILE)
-                       return 0;
-
-               /* HTILE */
-               ADDR2_COMPUTE_HTILE_INFO_INPUT hin = {0};
-               ADDR2_COMPUTE_HTILE_INFO_OUTPUT hout = {0};
-
-               hin.size = sizeof(ADDR2_COMPUTE_HTILE_INFO_INPUT);
-               hout.size = sizeof(ADDR2_COMPUTE_HTILE_INFO_OUTPUT);
-
-               assert(in->flags.metaPipeUnaligned == 0);
-               assert(in->flags.metaRbUnaligned == 0);
-
-               hin.hTileFlags.pipeAligned = 1;
-               hin.hTileFlags.rbAligned = 1;
-               hin.depthFlags = in->flags;
-               hin.swizzleMode = in->swizzleMode;
-               hin.unalignedWidth = in->width;
-               hin.unalignedHeight = in->height;
-               hin.numSlices = in->numSlices;
-               hin.numMipLevels = in->numMipLevels;
-               hin.firstMipIdInTail = out.firstMipIdInTail;
-
-               ret = Addr2ComputeHtileInfo(addrlib->handle, &hin, &hout);
-               if (ret != ADDR_OK)
-                       return ret;
-
-               surf->htile_size = hout.htileBytes;
-               surf->htile_slice_size = hout.sliceSize;
-               surf->htile_alignment = hout.baseAlign;
-               return 0;
-       }
-
-       {
-               /* Compute tile swizzle for the color surface.
-                * All *_X and *_T modes can use the swizzle.
-                */
-               if (config->info.surf_index &&
-                   in->swizzleMode >= ADDR_SW_64KB_Z_T &&
-                   !out.mipChainInTail &&
-                   !(surf->flags & RADEON_SURF_SHAREABLE) &&
-                   !in->flags.display) {
-                       ADDR2_COMPUTE_PIPEBANKXOR_INPUT xin = {0};
-                       ADDR2_COMPUTE_PIPEBANKXOR_OUTPUT xout = {0};
-
-                       xin.size = sizeof(ADDR2_COMPUTE_PIPEBANKXOR_INPUT);
-                       xout.size = sizeof(ADDR2_COMPUTE_PIPEBANKXOR_OUTPUT);
-
-                       xin.surfIndex = p_atomic_inc_return(config->info.surf_index) - 1;
-                       xin.flags = in->flags;
-                       xin.swizzleMode = in->swizzleMode;
-                       xin.resourceType = in->resourceType;
-                       xin.format = in->format;
-                       xin.numSamples = in->numSamples;
-                       xin.numFrags = in->numFrags;
-
-                       ret = Addr2ComputePipeBankXor(addrlib->handle, &xin, &xout);
-                       if (ret != ADDR_OK)
-                               return ret;
-
-                       assert(xout.pipeBankXor <=
-                              u_bit_consecutive(0, sizeof(surf->tile_swizzle) * 8));
-                       surf->tile_swizzle = xout.pipeBankXor;
-               }
-
-               /* DCC */
-               if (info->has_graphics &&
-                   !(surf->flags & RADEON_SURF_DISABLE_DCC) &&
-                   !compressed &&
-                   is_dcc_supported_by_CB(info, in->swizzleMode) &&
-                   (!in->flags.display ||
-                    is_dcc_supported_by_DCN(info, config, surf,
-                                            !in->flags.metaRbUnaligned,
-                                            !in->flags.metaPipeUnaligned))) {
-                       ADDR2_COMPUTE_DCCINFO_INPUT din = {0};
-                       ADDR2_COMPUTE_DCCINFO_OUTPUT dout = {0};
-                       ADDR2_META_MIP_INFO meta_mip_info[RADEON_SURF_MAX_LEVELS] = {};
-
-                       din.size = sizeof(ADDR2_COMPUTE_DCCINFO_INPUT);
-                       dout.size = sizeof(ADDR2_COMPUTE_DCCINFO_OUTPUT);
-                       dout.pMipInfo = meta_mip_info;
-
-                       din.dccKeyFlags.pipeAligned = !in->flags.metaPipeUnaligned;
-                       din.dccKeyFlags.rbAligned = !in->flags.metaRbUnaligned;
-                       din.resourceType = in->resourceType;
-                       din.swizzleMode = in->swizzleMode;
-                       din.bpp = in->bpp;
-                       din.unalignedWidth = in->width;
-                       din.unalignedHeight = in->height;
-                       din.numSlices = in->numSlices;
-                       din.numFrags = in->numFrags;
-                       din.numMipLevels = in->numMipLevels;
-                       din.dataSurfaceSize = out.surfSize;
-                       din.firstMipIdInTail = out.firstMipIdInTail;
-
-                       ret = Addr2ComputeDccInfo(addrlib->handle, &din, &dout);
-                       if (ret != ADDR_OK)
-                               return ret;
-
-                       surf->u.gfx9.dcc.rb_aligned = din.dccKeyFlags.rbAligned;
-                       surf->u.gfx9.dcc.pipe_aligned = din.dccKeyFlags.pipeAligned;
-                       surf->u.gfx9.dcc_block_width = dout.compressBlkWidth;
-                       surf->u.gfx9.dcc_block_height = dout.compressBlkHeight;
-                       surf->u.gfx9.dcc_block_depth = dout.compressBlkDepth;
-                       surf->dcc_size = dout.dccRamSize;
-                       surf->dcc_alignment = dout.dccRamBaseAlign;
-                       surf->num_dcc_levels = in->numMipLevels;
-
-                       /* Disable DCC for levels that are in the mip tail.
-                        *
-                        * There are two issues that this is intended to
-                        * address:
-                        *
-                        * 1. Multiple mip levels may share a cache line. This
-                        *    can lead to corruption when switching between
-                        *    rendering to different mip levels because the
-                        *    RBs don't maintain coherency.
-                        *
-                        * 2. Texturing with metadata after rendering sometimes
-                        *    fails with corruption, probably for a similar
-                        *    reason.
-                        *
-                        * Working around these issues for all levels in the
-                        * mip tail may be overly conservative, but it's what
-                        * Vulkan does.
-                        *
-                        * Alternative solutions that also work but are worse:
-                        * - Disable DCC entirely.
-                        * - Flush TC L2 after rendering.
-                        */
-                       for (unsigned i = 0; i < in->numMipLevels; i++) {
-                               if (meta_mip_info[i].inMiptail) {
-                                       /* GFX10 can only compress the first level
-                                        * in the mip tail.
-                                        *
-                                        * TODO: Try to do the same thing for gfx9
-                                        *       if there are no regressions.
-                                        */
-                                       if (info->chip_class >= GFX10)
-                                               surf->num_dcc_levels = i + 1;
-                                       else
-                                               surf->num_dcc_levels = i;
-                                       break;
-                               }
-                       }
-
-                       if (!surf->num_dcc_levels)
-                               surf->dcc_size = 0;
-
-                       surf->u.gfx9.display_dcc_size = surf->dcc_size;
-                       surf->u.gfx9.display_dcc_alignment = surf->dcc_alignment;
-                       surf->u.gfx9.display_dcc_pitch_max = dout.pitch - 1;
-
-                       /* Compute displayable DCC. */
-                       if (in->flags.display &&
-                           surf->num_dcc_levels &&
-                           info->use_display_dcc_with_retile_blit) {
-                               /* Compute displayable DCC info. */
-                               din.dccKeyFlags.pipeAligned = 0;
-                               din.dccKeyFlags.rbAligned = 0;
-
-                               assert(din.numSlices == 1);
-                               assert(din.numMipLevels == 1);
-                               assert(din.numFrags == 1);
-                               assert(surf->tile_swizzle == 0);
-                               assert(surf->u.gfx9.dcc.pipe_aligned ||
-                                      surf->u.gfx9.dcc.rb_aligned);
-
-                               ret = Addr2ComputeDccInfo(addrlib->handle, &din, &dout);
-                               if (ret != ADDR_OK)
-                                       return ret;
-
-                               surf->u.gfx9.display_dcc_size = dout.dccRamSize;
-                               surf->u.gfx9.display_dcc_alignment = dout.dccRamBaseAlign;
-                               surf->u.gfx9.display_dcc_pitch_max = dout.pitch - 1;
-                               assert(surf->u.gfx9.display_dcc_size <= surf->dcc_size);
-
-                               surf->u.gfx9.dcc_retile_use_uint16 =
-                                       surf->u.gfx9.display_dcc_size <= UINT16_MAX + 1 &&
-                                       surf->dcc_size <= UINT16_MAX + 1;
-
-                               /* Align the retile map size to get more hash table hits and
-                                * decrease the maximum memory footprint when all retile maps
-                                * are cached in the hash table.
-                                */
-                               unsigned retile_dim[2] = {in->width, in->height};
-
-                               for (unsigned i = 0; i < 2; i++) {
-                                       /* Increase the alignment as the size increases.
-                                        * Greater alignment increases retile compute work,
-                                        * but decreases maximum memory footprint for the cache.
-                                        *
-                                        * With this alignment, the worst case memory footprint of
-                                        * the cache is:
-                                        *   1920x1080: 55 MB
-                                        *   2560x1440: 99 MB
-                                        *   3840x2160: 305 MB
-                                        *
-                                        * The worst case size in MB can be computed in Haskell as follows:
-                                        *   (sum (map get_retile_size (map get_dcc_size (deduplicate (map align_pair
-                                        *       [(i*16,j*16) | i <- [1..maxwidth`div`16], j <- [1..maxheight`div`16]]))))) `div` 1024^2
-                                        *     where
-                                        *       alignment x = if x <= 512 then 16 else if x <= 1024 then 32 else if x <= 2048 then 64 else 128
-                                        *       align x = (x + (alignment x) - 1) `div` (alignment x) * (alignment x)
-                                        *       align_pair e = (align (fst e), align (snd e))
-                                        *       deduplicate = map head . groupBy (\ a b -> ((fst a) == (fst b)) && ((snd a) == (snd b))) . sortBy compare
-                                        *       get_dcc_size e = ((fst e) * (snd e) * bpp) `div` 256
-                                        *       get_retile_size dcc_size = dcc_size * 2 * (if dcc_size <= 2^16 then 2 else 4)
-                                        *       bpp = 4; maxwidth = 3840; maxheight = 2160
-                                        */
-                                       if (retile_dim[i] <= 512)
-                                               retile_dim[i] = align(retile_dim[i], 16);
-                                       else if (retile_dim[i] <= 1024)
-                                               retile_dim[i] = align(retile_dim[i], 32);
-                                       else if (retile_dim[i] <= 2048)
-                                               retile_dim[i] = align(retile_dim[i], 64);
-                                       else
-                                               retile_dim[i] = align(retile_dim[i], 128);
-
-                                       /* Don't align more than the DCC pixel alignment. */
-                                       assert(dout.metaBlkWidth >= 128 && dout.metaBlkHeight >= 128);
-                               }
-
-                               surf->u.gfx9.dcc_retile_num_elements =
-                                       DIV_ROUND_UP(retile_dim[0], dout.compressBlkWidth) *
-                                       DIV_ROUND_UP(retile_dim[1], dout.compressBlkHeight) * 2;
-                               /* Align the size to 4 (for the compute shader). */
-                               surf->u.gfx9.dcc_retile_num_elements =
-                                       align(surf->u.gfx9.dcc_retile_num_elements, 4);
-
-                               if (!(surf->flags & RADEON_SURF_IMPORTED)) {
-                                       /* Compute address mapping from non-displayable to displayable DCC. */
-                                       ADDR2_COMPUTE_DCC_ADDRFROMCOORD_INPUT addrin;
-                                       memset(&addrin, 0, sizeof(addrin));
-                                       addrin.size             = sizeof(addrin);
-                                       addrin.swizzleMode      = din.swizzleMode;
-                                       addrin.resourceType     = din.resourceType;
-                                       addrin.bpp              = din.bpp;
-                                       addrin.numSlices        = 1;
-                                       addrin.numMipLevels     = 1;
-                                       addrin.numFrags         = 1;
-                                       addrin.pitch            = dout.pitch;
-                                       addrin.height           = dout.height;
-                                       addrin.compressBlkWidth = dout.compressBlkWidth;
-                                       addrin.compressBlkHeight = dout.compressBlkHeight;
-                                       addrin.compressBlkDepth = dout.compressBlkDepth;
-                                       addrin.metaBlkWidth     = dout.metaBlkWidth;
-                                       addrin.metaBlkHeight    = dout.metaBlkHeight;
-                                       addrin.metaBlkDepth     = dout.metaBlkDepth;
-                                       addrin.dccRamSliceSize  = 0; /* Don't care for non-layered images. */
-
-                                       surf->u.gfx9.dcc_retile_map =
-                                               ac_compute_dcc_retile_map(addrlib, info,
-                                                                         retile_dim[0], retile_dim[1],
-                                                                         surf->u.gfx9.dcc.rb_aligned,
-                                                                         surf->u.gfx9.dcc.pipe_aligned,
-                                                                         surf->u.gfx9.dcc_retile_use_uint16,
-                                                                         surf->u.gfx9.dcc_retile_num_elements,
-                                                                         &addrin);
-                                       if (!surf->u.gfx9.dcc_retile_map)
-                                               return ADDR_OUTOFMEMORY;
-                               }
-                       }
-               }
-
-               /* FMASK */
-               if (in->numSamples > 1 && info->has_graphics &&
-                   !(surf->flags & RADEON_SURF_NO_FMASK)) {
-                       ADDR2_COMPUTE_FMASK_INFO_INPUT fin = {0};
-                       ADDR2_COMPUTE_FMASK_INFO_OUTPUT fout = {0};
-
-                       fin.size = sizeof(ADDR2_COMPUTE_FMASK_INFO_INPUT);
-                       fout.size = sizeof(ADDR2_COMPUTE_FMASK_INFO_OUTPUT);
-
-                       ret = gfx9_get_preferred_swizzle_mode(addrlib->handle, surf, in,
-                                                             true, &fin.swizzleMode);
-                       if (ret != ADDR_OK)
-                               return ret;
-
-                       fin.unalignedWidth = in->width;
-                       fin.unalignedHeight = in->height;
-                       fin.numSlices = in->numSlices;
-                       fin.numSamples = in->numSamples;
-                       fin.numFrags = in->numFrags;
-
-                       ret = Addr2ComputeFmaskInfo(addrlib->handle, &fin, &fout);
-                       if (ret != ADDR_OK)
-                               return ret;
-
-                       surf->u.gfx9.fmask.swizzle_mode = fin.swizzleMode;
-                       surf->u.gfx9.fmask.epitch = fout.pitch - 1;
-                       surf->fmask_size = fout.fmaskBytes;
-                       surf->fmask_alignment = fout.baseAlign;
-
-                       /* Compute tile swizzle for the FMASK surface. */
-                       if (config->info.fmask_surf_index &&
-                           fin.swizzleMode >= ADDR_SW_64KB_Z_T &&
-                           !(surf->flags & RADEON_SURF_SHAREABLE)) {
-                               ADDR2_COMPUTE_PIPEBANKXOR_INPUT xin = {0};
-                               ADDR2_COMPUTE_PIPEBANKXOR_OUTPUT xout = {0};
-
-                               xin.size = sizeof(ADDR2_COMPUTE_PIPEBANKXOR_INPUT);
-                               xout.size = sizeof(ADDR2_COMPUTE_PIPEBANKXOR_OUTPUT);
-
-                               /* This counter starts from 1 instead of 0. */
-                               xin.surfIndex = p_atomic_inc_return(config->info.fmask_surf_index);
-                               xin.flags = in->flags;
-                               xin.swizzleMode = fin.swizzleMode;
-                               xin.resourceType = in->resourceType;
-                               xin.format = in->format;
-                               xin.numSamples = in->numSamples;
-                               xin.numFrags = in->numFrags;
-
-                               ret = Addr2ComputePipeBankXor(addrlib->handle, &xin, &xout);
-                               if (ret != ADDR_OK)
-                                       return ret;
-
-                               assert(xout.pipeBankXor <=
-                                      u_bit_consecutive(0, sizeof(surf->fmask_tile_swizzle) * 8));
-                               surf->fmask_tile_swizzle = xout.pipeBankXor;
-                       }
-               }
-
-               /* CMASK -- on GFX10 only for FMASK */
-               if (in->swizzleMode != ADDR_SW_LINEAR &&
-                   in->resourceType == ADDR_RSRC_TEX_2D &&
-                   ((info->chip_class <= GFX9 &&
-                     in->numSamples == 1 &&
-                     in->flags.metaPipeUnaligned == 0 &&
-                     in->flags.metaRbUnaligned == 0) ||
-                    (surf->fmask_size && in->numSamples >= 2))) {
-                       ADDR2_COMPUTE_CMASK_INFO_INPUT cin = {0};
-                       ADDR2_COMPUTE_CMASK_INFO_OUTPUT cout = {0};
-
-                       cin.size = sizeof(ADDR2_COMPUTE_CMASK_INFO_INPUT);
-                       cout.size = sizeof(ADDR2_COMPUTE_CMASK_INFO_OUTPUT);
-
-                       assert(in->flags.metaPipeUnaligned == 0);
-                       assert(in->flags.metaRbUnaligned == 0);
-
-                       cin.cMaskFlags.pipeAligned = 1;
-                       cin.cMaskFlags.rbAligned = 1;
-                       cin.resourceType = in->resourceType;
-                       cin.unalignedWidth = in->width;
-                       cin.unalignedHeight = in->height;
-                       cin.numSlices = in->numSlices;
-
-                       if (in->numSamples > 1)
-                               cin.swizzleMode = surf->u.gfx9.fmask.swizzle_mode;
-                       else
-                               cin.swizzleMode = in->swizzleMode;
-
-                       ret = Addr2ComputeCmaskInfo(addrlib->handle, &cin, &cout);
-                       if (ret != ADDR_OK)
-                               return ret;
-
-                       surf->cmask_size = cout.cmaskBytes;
-                       surf->cmask_alignment = cout.baseAlign;
-               }
-       }
-
-       return 0;
+   ADDR2_MIP_INFO mip_info[RADEON_SURF_MAX_LEVELS] = {};
+   ADDR2_COMPUTE_SURFACE_INFO_OUTPUT out = {0};
+   ADDR_E_RETURNCODE ret;
+
+   out.size = sizeof(ADDR2_COMPUTE_SURFACE_INFO_OUTPUT);
+   out.pMipInfo = mip_info;
+
+   ret = Addr2ComputeSurfaceInfo(addrlib->handle, in, &out);
+   if (ret != ADDR_OK)
+      return ret;
+
+   if (in->flags.stencil) {
+      surf->u.gfx9.stencil.swizzle_mode = in->swizzleMode;
+      surf->u.gfx9.stencil.epitch =
+         out.epitchIsHeight ? out.mipChainHeight - 1 : out.mipChainPitch - 1;
+      surf->surf_alignment = MAX2(surf->surf_alignment, out.baseAlign);
+      surf->u.gfx9.stencil_offset = align(surf->surf_size, out.baseAlign);
+      surf->surf_size = surf->u.gfx9.stencil_offset + out.surfSize;
+      return 0;
+   }
+
+   surf->u.gfx9.surf.swizzle_mode = in->swizzleMode;
+   surf->u.gfx9.surf.epitch = out.epitchIsHeight ? out.mipChainHeight - 1 : out.mipChainPitch - 1;
+
+   /* CMASK fast clear uses these even if FMASK isn't allocated.
+    * FMASK only supports the Z swizzle modes, whose numbers are multiples of 4.
+    */
+   surf->u.gfx9.fmask.swizzle_mode = surf->u.gfx9.surf.swizzle_mode & ~0x3;
+   surf->u.gfx9.fmask.epitch = surf->u.gfx9.surf.epitch;
+
+   surf->u.gfx9.surf_slice_size = out.sliceSize;
+   surf->u.gfx9.surf_pitch = out.pitch;
+   surf->u.gfx9.surf_height = out.height;
+   surf->surf_size = out.surfSize;
+   surf->surf_alignment = out.baseAlign;
+
+   if (!compressed && surf->blk_w > 1 && out.pitch == out.pixelPitch &&
+       surf->u.gfx9.surf.swizzle_mode == ADDR_SW_LINEAR) {
+      /* Adjust surf_pitch to be in element units rather than pixels. */
+      surf->u.gfx9.surf_pitch = align(surf->u.gfx9.surf_pitch / surf->blk_w, 256 / surf->bpe);
+      surf->u.gfx9.surf.epitch =
+         MAX2(surf->u.gfx9.surf.epitch, surf->u.gfx9.surf_pitch * surf->blk_w - 1);
+      /* The surface is really a surf->bpe bytes per pixel surface even if we
+       * use it as a surf->bpe bytes per element one.
+       * Adjust surf_slice_size and surf_size to reflect the change
+       * made to surf_pitch.
+       */
+      surf->u.gfx9.surf_slice_size =
+         MAX2(surf->u.gfx9.surf_slice_size,
+              surf->u.gfx9.surf_pitch * out.height * surf->bpe * surf->blk_w);
+      surf->surf_size = surf->u.gfx9.surf_slice_size * in->numSlices;
+   }
+
+   if (in->swizzleMode == ADDR_SW_LINEAR) {
+      for (unsigned i = 0; i < in->numMipLevels; i++) {
+         surf->u.gfx9.offset[i] = mip_info[i].offset;
+         surf->u.gfx9.pitch[i] = mip_info[i].pitch;
+      }
+   }
+
+   if (in->flags.depth) {
+      assert(in->swizzleMode != ADDR_SW_LINEAR);
+
+      if (surf->flags & RADEON_SURF_NO_HTILE)
+         return 0;
+
+      /* HTILE */
+      ADDR2_COMPUTE_HTILE_INFO_INPUT hin = {0};
+      ADDR2_COMPUTE_HTILE_INFO_OUTPUT hout = {0};
+
+      hin.size = sizeof(ADDR2_COMPUTE_HTILE_INFO_INPUT);
+      hout.size = sizeof(ADDR2_COMPUTE_HTILE_INFO_OUTPUT);
+
+      assert(in->flags.metaPipeUnaligned == 0);
+      assert(in->flags.metaRbUnaligned == 0);
+
+      hin.hTileFlags.pipeAligned = 1;
+      hin.hTileFlags.rbAligned = 1;
+      hin.depthFlags = in->flags;
+      hin.swizzleMode = in->swizzleMode;
+      hin.unalignedWidth = in->width;
+      hin.unalignedHeight = in->height;
+      hin.numSlices = in->numSlices;
+      hin.numMipLevels = in->numMipLevels;
+      hin.firstMipIdInTail = out.firstMipIdInTail;
+
+      ret = Addr2ComputeHtileInfo(addrlib->handle, &hin, &hout);
+      if (ret != ADDR_OK)
+         return ret;
+
+      surf->htile_size = hout.htileBytes;
+      surf->htile_slice_size = hout.sliceSize;
+      surf->htile_alignment = hout.baseAlign;
+      return 0;
+   }
+
+   {
+      /* Compute tile swizzle for the color surface.
+       * All *_X and *_T modes can use the swizzle.
+       */
+      if (config->info.surf_index && in->swizzleMode >= ADDR_SW_64KB_Z_T && !out.mipChainInTail &&
+          !(surf->flags & RADEON_SURF_SHAREABLE) && !in->flags.display) {
+         ADDR2_COMPUTE_PIPEBANKXOR_INPUT xin = {0};
+         ADDR2_COMPUTE_PIPEBANKXOR_OUTPUT xout = {0};
+
+         xin.size = sizeof(ADDR2_COMPUTE_PIPEBANKXOR_INPUT);
+         xout.size = sizeof(ADDR2_COMPUTE_PIPEBANKXOR_OUTPUT);
+
+         xin.surfIndex = p_atomic_inc_return(config->info.surf_index) - 1;
+         xin.flags = in->flags;
+         xin.swizzleMode = in->swizzleMode;
+         xin.resourceType = in->resourceType;
+         xin.format = in->format;
+         xin.numSamples = in->numSamples;
+         xin.numFrags = in->numFrags;
+
+         ret = Addr2ComputePipeBankXor(addrlib->handle, &xin, &xout);
+         if (ret != ADDR_OK)
+            return ret;
+
+         assert(xout.pipeBankXor <= u_bit_consecutive(0, sizeof(surf->tile_swizzle) * 8));
+         surf->tile_swizzle = xout.pipeBankXor;
+      }
+
+      /* DCC */
+      if (info->has_graphics && !(surf->flags & RADEON_SURF_DISABLE_DCC) && !compressed &&
+          is_dcc_supported_by_CB(info, in->swizzleMode) &&
+          (!in->flags.display ||
+           is_dcc_supported_by_DCN(info, config, surf, !in->flags.metaRbUnaligned,
+                                   !in->flags.metaPipeUnaligned))) {
+         ADDR2_COMPUTE_DCCINFO_INPUT din = {0};
+         ADDR2_COMPUTE_DCCINFO_OUTPUT dout = {0};
+         ADDR2_META_MIP_INFO meta_mip_info[RADEON_SURF_MAX_LEVELS] = {};
+
+         din.size = sizeof(ADDR2_COMPUTE_DCCINFO_INPUT);
+         dout.size = sizeof(ADDR2_COMPUTE_DCCINFO_OUTPUT);
+         dout.pMipInfo = meta_mip_info;
+
+         din.dccKeyFlags.pipeAligned = !in->flags.metaPipeUnaligned;
+         din.dccKeyFlags.rbAligned = !in->flags.metaRbUnaligned;
+         din.resourceType = in->resourceType;
+         din.swizzleMode = in->swizzleMode;
+         din.bpp = in->bpp;
+         din.unalignedWidth = in->width;
+         din.unalignedHeight = in->height;
+         din.numSlices = in->numSlices;
+         din.numFrags = in->numFrags;
+         din.numMipLevels = in->numMipLevels;
+         din.dataSurfaceSize = out.surfSize;
+         din.firstMipIdInTail = out.firstMipIdInTail;
+
+         ret = Addr2ComputeDccInfo(addrlib->handle, &din, &dout);
+         if (ret != ADDR_OK)
+            return ret;
+
+         surf->u.gfx9.dcc.rb_aligned = din.dccKeyFlags.rbAligned;
+         surf->u.gfx9.dcc.pipe_aligned = din.dccKeyFlags.pipeAligned;
+         surf->u.gfx9.dcc_block_width = dout.compressBlkWidth;
+         surf->u.gfx9.dcc_block_height = dout.compressBlkHeight;
+         surf->u.gfx9.dcc_block_depth = dout.compressBlkDepth;
+         surf->dcc_size = dout.dccRamSize;
+         surf->dcc_alignment = dout.dccRamBaseAlign;
+         surf->num_dcc_levels = in->numMipLevels;
+
+         /* Disable DCC for levels that are in the mip tail.
+          *
+          * There are two issues that this is intended to
+          * address:
+          *
+          * 1. Multiple mip levels may share a cache line. This
+          *    can lead to corruption when switching between
+          *    rendering to different mip levels because the
+          *    RBs don't maintain coherency.
+          *
+          * 2. Texturing with metadata after rendering sometimes
+          *    fails with corruption, probably for a similar
+          *    reason.
+          *
+          * Working around these issues for all levels in the
+          * mip tail may be overly conservative, but it's what
+          * Vulkan does.
+          *
+          * Alternative solutions that also work but are worse:
+          * - Disable DCC entirely.
+          * - Flush TC L2 after rendering.
+          */
+         for (unsigned i = 0; i < in->numMipLevels; i++) {
+            if (meta_mip_info[i].inMiptail) {
+               /* GFX10 can only compress the first level
+                * in the mip tail.
+                *
+                * TODO: Try to do the same thing for gfx9
+                *       if there are no regressions.
+                */
+               if (info->chip_class >= GFX10)
+                  surf->num_dcc_levels = i + 1;
+               else
+                  surf->num_dcc_levels = i;
+               break;
+            }
+         }
+
+         if (!surf->num_dcc_levels)
+            surf->dcc_size = 0;
+
+         surf->u.gfx9.display_dcc_size = surf->dcc_size;
+         surf->u.gfx9.display_dcc_alignment = surf->dcc_alignment;
+         surf->u.gfx9.display_dcc_pitch_max = dout.pitch - 1;
+
+         /* Compute displayable DCC. */
+         if (in->flags.display && surf->num_dcc_levels && info->use_display_dcc_with_retile_blit) {
+            /* Compute displayable DCC info. */
+            din.dccKeyFlags.pipeAligned = 0;
+            din.dccKeyFlags.rbAligned = 0;
+
+            assert(din.numSlices == 1);
+            assert(din.numMipLevels == 1);
+            assert(din.numFrags == 1);
+            assert(surf->tile_swizzle == 0);
+            assert(surf->u.gfx9.dcc.pipe_aligned || surf->u.gfx9.dcc.rb_aligned);
+
+            ret = Addr2ComputeDccInfo(addrlib->handle, &din, &dout);
+            if (ret != ADDR_OK)
+               return ret;
+
+            surf->u.gfx9.display_dcc_size = dout.dccRamSize;
+            surf->u.gfx9.display_dcc_alignment = dout.dccRamBaseAlign;
+            surf->u.gfx9.display_dcc_pitch_max = dout.pitch - 1;
+            assert(surf->u.gfx9.display_dcc_size <= surf->dcc_size);
+
+            surf->u.gfx9.dcc_retile_use_uint16 =
+               surf->u.gfx9.display_dcc_size <= UINT16_MAX + 1 && surf->dcc_size <= UINT16_MAX + 1;
+
+            /* Align the retile map size to get more hash table hits and
+             * decrease the maximum memory footprint when all retile maps
+             * are cached in the hash table.
+             */
+            unsigned retile_dim[2] = {in->width, in->height};
+
+            for (unsigned i = 0; i < 2; i++) {
+               /* Increase the alignment as the size increases.
+                * Greater alignment increases retile compute work,
+                * but decreases maximum memory footprint for the cache.
+                *
+                * With this alignment, the worst case memory footprint of
+                * the cache is:
+                *   1920x1080: 55 MB
+                *   2560x1440: 99 MB
+                *   3840x2160: 305 MB
+                *
+                * The worst case size in MB can be computed in Haskell as follows:
+                *   (sum (map get_retile_size (map get_dcc_size (deduplicate (map align_pair
+                *       [(i*16,j*16) | i <- [1..maxwidth`div`16], j <- [1..maxheight`div`16]])))))
+                *   `div` 1024^2
+                *   where
+                *      alignment x = if x <= 512 then 16 else if x <= 1024 then 32
+                *                    else if x <= 2048 then 64 else 128
+                *      align x = (x + (alignment x) - 1) `div` (alignment x) * (alignment x)
+                *      align_pair e = (align (fst e), align (snd e))
+                *      deduplicate = map head
+                *         . groupBy (\ a b -> ((fst a) == (fst b)) && ((snd a) == (snd b)))
+                *         . sortBy compare
+                *      get_dcc_size e = ((fst e) * (snd e) * bpp) `div` 256
+                *      get_retile_size dcc_size =
+                *         dcc_size * 2 * (if dcc_size <= 2^16 then 2 else 4)
+                *      bpp = 4; maxwidth = 3840; maxheight = 2160
+                */
+               if (retile_dim[i] <= 512)
+                  retile_dim[i] = align(retile_dim[i], 16);
+               else if (retile_dim[i] <= 1024)
+                  retile_dim[i] = align(retile_dim[i], 32);
+               else if (retile_dim[i] <= 2048)
+                  retile_dim[i] = align(retile_dim[i], 64);
+               else
+                  retile_dim[i] = align(retile_dim[i], 128);
+
+               /* Don't align more than the DCC pixel alignment. */
+               assert(dout.metaBlkWidth >= 128 && dout.metaBlkHeight >= 128);
+            }
+
+            surf->u.gfx9.dcc_retile_num_elements =
+               DIV_ROUND_UP(retile_dim[0], dout.compressBlkWidth) *
+               DIV_ROUND_UP(retile_dim[1], dout.compressBlkHeight) * 2;
+            /* Align the size to 4 (for the compute shader). */
+            surf->u.gfx9.dcc_retile_num_elements = align(surf->u.gfx9.dcc_retile_num_elements, 4);
+
+            if (!(surf->flags & RADEON_SURF_IMPORTED)) {
+               /* Compute address mapping from non-displayable to displayable DCC. */
+               ADDR2_COMPUTE_DCC_ADDRFROMCOORD_INPUT addrin;
+               memset(&addrin, 0, sizeof(addrin));
+               addrin.size = sizeof(addrin);
+               addrin.swizzleMode = din.swizzleMode;
+               addrin.resourceType = din.resourceType;
+               addrin.bpp = din.bpp;
+               addrin.numSlices = 1;
+               addrin.numMipLevels = 1;
+               addrin.numFrags = 1;
+               addrin.pitch = dout.pitch;
+               addrin.height = dout.height;
+               addrin.compressBlkWidth = dout.compressBlkWidth;
+               addrin.compressBlkHeight = dout.compressBlkHeight;
+               addrin.compressBlkDepth = dout.compressBlkDepth;
+               addrin.metaBlkWidth = dout.metaBlkWidth;
+               addrin.metaBlkHeight = dout.metaBlkHeight;
+               addrin.metaBlkDepth = dout.metaBlkDepth;
+               addrin.dccRamSliceSize = 0; /* Don't care for non-layered images. */
+
+               surf->u.gfx9.dcc_retile_map = ac_compute_dcc_retile_map(
+                  addrlib, info, retile_dim[0], retile_dim[1], surf->u.gfx9.dcc.rb_aligned,
+                  surf->u.gfx9.dcc.pipe_aligned, surf->u.gfx9.dcc_retile_use_uint16,
+                  surf->u.gfx9.dcc_retile_num_elements, &addrin);
+               if (!surf->u.gfx9.dcc_retile_map)
+                  return ADDR_OUTOFMEMORY;
+            }
+         }
+      }
+
+      /* FMASK */
+      if (in->numSamples > 1 && info->has_graphics && !(surf->flags & RADEON_SURF_NO_FMASK)) {
+         ADDR2_COMPUTE_FMASK_INFO_INPUT fin = {0};
+         ADDR2_COMPUTE_FMASK_INFO_OUTPUT fout = {0};
+
+         fin.size = sizeof(ADDR2_COMPUTE_FMASK_INFO_INPUT);
+         fout.size = sizeof(ADDR2_COMPUTE_FMASK_INFO_OUTPUT);
+
+         ret = gfx9_get_preferred_swizzle_mode(addrlib->handle, surf, in, true, &fin.swizzleMode);
+         if (ret != ADDR_OK)
+            return ret;
+
+         fin.unalignedWidth = in->width;
+         fin.unalignedHeight = in->height;
+         fin.numSlices = in->numSlices;
+         fin.numSamples = in->numSamples;
+         fin.numFrags = in->numFrags;
+
+         ret = Addr2ComputeFmaskInfo(addrlib->handle, &fin, &fout);
+         if (ret != ADDR_OK)
+            return ret;
+
+         surf->u.gfx9.fmask.swizzle_mode = fin.swizzleMode;
+         surf->u.gfx9.fmask.epitch = fout.pitch - 1;
+         surf->fmask_size = fout.fmaskBytes;
+         surf->fmask_alignment = fout.baseAlign;
+
+         /* Compute tile swizzle for the FMASK surface. */
+         if (config->info.fmask_surf_index && fin.swizzleMode >= ADDR_SW_64KB_Z_T &&
+             !(surf->flags & RADEON_SURF_SHAREABLE)) {
+            ADDR2_COMPUTE_PIPEBANKXOR_INPUT xin = {0};
+            ADDR2_COMPUTE_PIPEBANKXOR_OUTPUT xout = {0};
+
+            xin.size = sizeof(ADDR2_COMPUTE_PIPEBANKXOR_INPUT);
+            xout.size = sizeof(ADDR2_COMPUTE_PIPEBANKXOR_OUTPUT);
+
+            /* This counter starts from 1 instead of 0. */
+            xin.surfIndex = p_atomic_inc_return(config->info.fmask_surf_index);
+            xin.flags = in->flags;
+            xin.swizzleMode = fin.swizzleMode;
+            xin.resourceType = in->resourceType;
+            xin.format = in->format;
+            xin.numSamples = in->numSamples;
+            xin.numFrags = in->numFrags;
+
+            ret = Addr2ComputePipeBankXor(addrlib->handle, &xin, &xout);
+            if (ret != ADDR_OK)
+               return ret;
+
+            assert(xout.pipeBankXor <= u_bit_consecutive(0, sizeof(surf->fmask_tile_swizzle) * 8));
+            surf->fmask_tile_swizzle = xout.pipeBankXor;
+         }
+      }
+
+      /* CMASK -- on GFX10 only for FMASK */
+      if (in->swizzleMode != ADDR_SW_LINEAR && in->resourceType == ADDR_RSRC_TEX_2D &&
+          ((info->chip_class <= GFX9 && in->numSamples == 1 && in->flags.metaPipeUnaligned == 0 &&
+            in->flags.metaRbUnaligned == 0) ||
+           (surf->fmask_size && in->numSamples >= 2))) {
+         ADDR2_COMPUTE_CMASK_INFO_INPUT cin = {0};
+         ADDR2_COMPUTE_CMASK_INFO_OUTPUT cout = {0};
+
+         cin.size = sizeof(ADDR2_COMPUTE_CMASK_INFO_INPUT);
+         cout.size = sizeof(ADDR2_COMPUTE_CMASK_INFO_OUTPUT);
+
+         assert(in->flags.metaPipeUnaligned == 0);
+         assert(in->flags.metaRbUnaligned == 0);
+
+         cin.cMaskFlags.pipeAligned = 1;
+         cin.cMaskFlags.rbAligned = 1;
+         cin.resourceType = in->resourceType;
+         cin.unalignedWidth = in->width;
+         cin.unalignedHeight = in->height;
+         cin.numSlices = in->numSlices;
+
+         if (in->numSamples > 1)
+            cin.swizzleMode = surf->u.gfx9.fmask.swizzle_mode;
+         else
+            cin.swizzleMode = in->swizzleMode;
+
+         ret = Addr2ComputeCmaskInfo(addrlib->handle, &cin, &cout);
+         if (ret != ADDR_OK)
+            return ret;
+
+         surf->cmask_size = cout.cmaskBytes;
+         surf->cmask_alignment = cout.baseAlign;
+      }
+   }
+
+   return 0;
 }
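
For reference, the retile-map sizing in gfx9_compute_miptree() above boils down to a few lines of integer arithmetic. Below is a minimal standalone sketch of that computation (illustrative only); the 8x8 compressed-block size and the 1920x1080 input are made-up values, since the real block dimensions come from Addr2ComputeDccInfo:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define ALIGN(x, a)        (DIV_ROUND_UP(x, a) * (a))

/* Sketch of the retile-map sizing; the block dimensions are assumptions. */
static unsigned retile_num_elements(unsigned width, unsigned height,
                                    unsigned blk_w, unsigned blk_h)
{
   unsigned dim[2] = {width, height};

   /* Same alignment buckets as above: coarser alignment for larger surfaces,
    * trading some retile work for fewer distinct cached maps.
    */
   for (unsigned i = 0; i < 2; i++) {
      if (dim[i] <= 512)
         dim[i] = ALIGN(dim[i], 16);
      else if (dim[i] <= 1024)
         dim[i] = ALIGN(dim[i], 32);
      else if (dim[i] <= 2048)
         dim[i] = ALIGN(dim[i], 64);
      else
         dim[i] = ALIGN(dim[i], 128);
   }

   /* Two map entries per DCC block, rounded up to a multiple of 4 for the
    * retile compute shader.
    */
   unsigned n = DIV_ROUND_UP(dim[0], blk_w) * DIV_ROUND_UP(dim[1], blk_h) * 2;
   return ALIGN(n, 4);
}

int main(void)
{
   printf("%u retile elements\n", retile_num_elements(1920, 1080, 8, 8));
   return 0;
}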
 
-static int gfx9_compute_surface(struct ac_addrlib *addrlib,
-                               const struct radeon_info *info,
-                               const struct ac_surf_config *config,
-                               enum radeon_surf_mode mode,
-                               struct radeon_surf *surf)
+static int gfx9_compute_surface(struct ac_addrlib *addrlib, const struct radeon_info *info,
+                                const struct ac_surf_config *config, enum radeon_surf_mode mode,
+                                struct radeon_surf *surf)
 {
-       bool compressed;
-       ADDR2_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0};
-       int r;
-
-       AddrSurfInfoIn.size = sizeof(ADDR2_COMPUTE_SURFACE_INFO_INPUT);
-
-       compressed = surf->blk_w == 4 && surf->blk_h == 4;
-
-       /* The format must be set correctly for the allocation of compressed
-        * textures to work. In other cases, setting the bpp is sufficient. */
-       if (compressed) {
-               switch (surf->bpe) {
-               case 8:
-                       AddrSurfInfoIn.format = ADDR_FMT_BC1;
-                       break;
-               case 16:
-                       AddrSurfInfoIn.format = ADDR_FMT_BC3;
-                       break;
-               default:
-                       assert(0);
-               }
-       } else {
-               switch (surf->bpe) {
-               case 1:
-                       assert(!(surf->flags & RADEON_SURF_ZBUFFER));
-                       AddrSurfInfoIn.format = ADDR_FMT_8;
-                       break;
-               case 2:
-                       assert(surf->flags & RADEON_SURF_ZBUFFER ||
-                              !(surf->flags & RADEON_SURF_SBUFFER));
-                       AddrSurfInfoIn.format = ADDR_FMT_16;
-                       break;
-               case 4:
-                       assert(surf->flags & RADEON_SURF_ZBUFFER ||
-                              !(surf->flags & RADEON_SURF_SBUFFER));
-                       AddrSurfInfoIn.format = ADDR_FMT_32;
-                       break;
-               case 8:
-                       assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
-                       AddrSurfInfoIn.format = ADDR_FMT_32_32;
-                       break;
-               case 12:
-                       assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
-                       AddrSurfInfoIn.format = ADDR_FMT_32_32_32;
-                       break;
-               case 16:
-                       assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
-                       AddrSurfInfoIn.format = ADDR_FMT_32_32_32_32;
-                       break;
-               default:
-                       assert(0);
-               }
-               AddrSurfInfoIn.bpp = surf->bpe * 8;
-       }
-
-       bool is_color_surface = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER);
-       AddrSurfInfoIn.flags.color = is_color_surface &&
-                                    !(surf->flags & RADEON_SURF_NO_RENDER_TARGET);
-       AddrSurfInfoIn.flags.depth = (surf->flags & RADEON_SURF_ZBUFFER) != 0;
-       AddrSurfInfoIn.flags.display = get_display_flag(config, surf);
-       /* flags.texture currently refers to TC-compatible HTILE */
-       AddrSurfInfoIn.flags.texture = is_color_surface ||
-                                      surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE;
-       AddrSurfInfoIn.flags.opt4space = 1;
-
-       AddrSurfInfoIn.numMipLevels = config->info.levels;
-       AddrSurfInfoIn.numSamples = MAX2(1, config->info.samples);
-       AddrSurfInfoIn.numFrags = AddrSurfInfoIn.numSamples;
-
-       if (!(surf->flags & RADEON_SURF_Z_OR_SBUFFER))
-               AddrSurfInfoIn.numFrags = MAX2(1, config->info.storage_samples);
-
-       /* GFX9 doesn't support 1D depth textures, so allocate all 1D textures
-        * as 2D to avoid having shader variants for 1D vs 2D, so all shaders
-        * must sample 1D textures as 2D. */
-       if (config->is_3d)
-               AddrSurfInfoIn.resourceType = ADDR_RSRC_TEX_3D;
-       else if (info->chip_class != GFX9 && config->is_1d)
-               AddrSurfInfoIn.resourceType = ADDR_RSRC_TEX_1D;
-       else
-               AddrSurfInfoIn.resourceType = ADDR_RSRC_TEX_2D;
-
-       AddrSurfInfoIn.width = config->info.width;
-       AddrSurfInfoIn.height = config->info.height;
-
-       if (config->is_3d)
-               AddrSurfInfoIn.numSlices = config->info.depth;
-       else if (config->is_cube)
-               AddrSurfInfoIn.numSlices = 6;
-       else
-               AddrSurfInfoIn.numSlices = config->info.array_size;
-
-       /* This is propagated to DCC. It must be 0 for HTILE and CMASK. */
-       AddrSurfInfoIn.flags.metaPipeUnaligned = 0;
-       AddrSurfInfoIn.flags.metaRbUnaligned = 0;
-
-       /* Optimal values for the L2 cache. */
-       if (info->chip_class == GFX9) {
-               surf->u.gfx9.dcc.independent_64B_blocks = 1;
-               surf->u.gfx9.dcc.independent_128B_blocks = 0;
-               surf->u.gfx9.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B;
-       } else if (info->chip_class >= GFX10) {
-               surf->u.gfx9.dcc.independent_64B_blocks = 0;
-               surf->u.gfx9.dcc.independent_128B_blocks = 1;
-               surf->u.gfx9.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B;
-       }
-
-       if (AddrSurfInfoIn.flags.display) {
-               /* The display hardware can only read DCC with RB_ALIGNED=0 and
-                * PIPE_ALIGNED=0. PIPE_ALIGNED really means L2CACHE_ALIGNED.
-                *
-                * The CB block requires RB_ALIGNED=1 except 1 RB chips.
-                * PIPE_ALIGNED is optional, but PIPE_ALIGNED=0 requires L2 flushes
-                * after rendering, so PIPE_ALIGNED=1 is recommended.
-                */
-               if (info->use_display_dcc_unaligned) {
-                       AddrSurfInfoIn.flags.metaPipeUnaligned = 1;
-                       AddrSurfInfoIn.flags.metaRbUnaligned = 1;
-               }
-
-               /* Adjust DCC settings to meet DCN requirements. */
-               if (info->use_display_dcc_unaligned ||
-                   info->use_display_dcc_with_retile_blit) {
-                       /* Only Navi12/14 support independent 64B blocks in L2,
-                        * but without DCC image stores.
-                        */
-                       if (info->family == CHIP_NAVI12 ||
-                           info->family == CHIP_NAVI14) {
-                               surf->u.gfx9.dcc.independent_64B_blocks = 1;
-                               surf->u.gfx9.dcc.independent_128B_blocks = 0;
-                               surf->u.gfx9.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B;
-                       }
-
-                       if (info->chip_class >= GFX10_3) {
-                               surf->u.gfx9.dcc.independent_64B_blocks = 1;
-                               surf->u.gfx9.dcc.independent_128B_blocks = 1;
-                               surf->u.gfx9.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B;
-                       }
-               }
-       }
-
-       switch (mode) {
-       case RADEON_SURF_MODE_LINEAR_ALIGNED:
-               assert(config->info.samples <= 1);
-               assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
-               AddrSurfInfoIn.swizzleMode = ADDR_SW_LINEAR;
-               break;
-
-       case RADEON_SURF_MODE_1D:
-       case RADEON_SURF_MODE_2D:
-               if (surf->flags & RADEON_SURF_IMPORTED ||
-                    (info->chip_class >= GFX10 &&
-                     surf->flags & RADEON_SURF_FORCE_SWIZZLE_MODE)) {
-                       AddrSurfInfoIn.swizzleMode = surf->u.gfx9.surf.swizzle_mode;
-                       break;
-               }
-
-               r = gfx9_get_preferred_swizzle_mode(addrlib->handle, surf, &AddrSurfInfoIn,
-                                                   false, &AddrSurfInfoIn.swizzleMode);
-               if (r)
-                       return r;
-               break;
-
-       default:
-               assert(0);
-       }
-
-       surf->u.gfx9.resource_type = AddrSurfInfoIn.resourceType;
-       surf->has_stencil = !!(surf->flags & RADEON_SURF_SBUFFER);
-
-       surf->num_dcc_levels = 0;
-       surf->surf_size = 0;
-       surf->fmask_size = 0;
-       surf->dcc_size = 0;
-       surf->htile_size = 0;
-       surf->htile_slice_size = 0;
-       surf->u.gfx9.surf_offset = 0;
-       surf->u.gfx9.stencil_offset = 0;
-       surf->cmask_size = 0;
-       surf->u.gfx9.dcc_retile_use_uint16 = false;
-       surf->u.gfx9.dcc_retile_num_elements = 0;
-       surf->u.gfx9.dcc_retile_map = NULL;
-
-       /* Calculate texture layout information. */
-       r = gfx9_compute_miptree(addrlib, info, config, surf, compressed,
-                                &AddrSurfInfoIn);
-       if (r)
-               return r;
-
-       /* Calculate texture layout information for stencil. */
-       if (surf->flags & RADEON_SURF_SBUFFER) {
-               AddrSurfInfoIn.flags.stencil = 1;
-               AddrSurfInfoIn.bpp = 8;
-               AddrSurfInfoIn.format = ADDR_FMT_8;
-
-               if (!AddrSurfInfoIn.flags.depth) {
-                       r = gfx9_get_preferred_swizzle_mode(addrlib->handle, surf, &AddrSurfInfoIn,
-                                                           false, &AddrSurfInfoIn.swizzleMode);
-                       if (r)
-                               return r;
-               } else
-                       AddrSurfInfoIn.flags.depth = 0;
-
-               r = gfx9_compute_miptree(addrlib, info, config, surf, compressed,
-                                        &AddrSurfInfoIn);
-               if (r)
-                       return r;
-       }
-
-       surf->is_linear = surf->u.gfx9.surf.swizzle_mode == ADDR_SW_LINEAR;
-
-       /* Query whether the surface is displayable. */
-       /* This is only useful for surfaces that are allocated without SCANOUT. */
-       bool displayable = false;
-       if (!config->is_3d && !config->is_cube) {
-               r = Addr2IsValidDisplaySwizzleMode(addrlib->handle, surf->u.gfx9.surf.swizzle_mode,
-                                                  surf->bpe * 8, &displayable);
-               if (r)
-                       return r;
-
-               /* Display needs unaligned DCC. */
-               if (surf->num_dcc_levels &&
-                   (!is_dcc_supported_by_DCN(info, config, surf,
-                                             surf->u.gfx9.dcc.rb_aligned,
-                                             surf->u.gfx9.dcc.pipe_aligned) ||
-                    /* Don't set is_displayable if displayable DCC is missing. */
-                    (info->use_display_dcc_with_retile_blit &&
-                     !surf->u.gfx9.dcc_retile_num_elements)))
-                       displayable = false;
-       }
-       surf->is_displayable = displayable;
-
-       /* Validate that we allocated a displayable surface if requested. */
-       assert(!AddrSurfInfoIn.flags.display || surf->is_displayable);
-
-       /* Validate that DCC is set up correctly. */
-       if (surf->num_dcc_levels) {
-               assert(is_dcc_supported_by_L2(info, surf));
-               if (AddrSurfInfoIn.flags.color)
-                       assert(is_dcc_supported_by_CB(info, surf->u.gfx9.surf.swizzle_mode));
-               if (AddrSurfInfoIn.flags.display) {
-                       assert(is_dcc_supported_by_DCN(info, config, surf,
-                                                      surf->u.gfx9.dcc.rb_aligned,
-                                                      surf->u.gfx9.dcc.pipe_aligned));
-               }
-       }
-
-       if (info->has_graphics &&
-           !compressed &&
-           !config->is_3d &&
-           config->info.levels == 1 &&
-           AddrSurfInfoIn.flags.color &&
-           !surf->is_linear &&
-           surf->surf_alignment >= 64 * 1024 && /* 64KB tiling */
-           !(surf->flags & (RADEON_SURF_DISABLE_DCC |
-                            RADEON_SURF_FORCE_SWIZZLE_MODE |
-                            RADEON_SURF_FORCE_MICRO_TILE_MODE))) {
-               /* Validate that DCC is enabled if DCN can do it. */
-               if ((info->use_display_dcc_unaligned ||
-                    info->use_display_dcc_with_retile_blit) &&
-                   AddrSurfInfoIn.flags.display &&
-                   surf->bpe == 4) {
-                       assert(surf->num_dcc_levels);
-               }
-
-               /* Validate that non-scanout DCC is always enabled. */
-               if (!AddrSurfInfoIn.flags.display)
-                       assert(surf->num_dcc_levels);
-       }
-
-       if (!surf->htile_size) {
-               /* Unset this if HTILE is not present. */
-               surf->flags &= ~RADEON_SURF_TC_COMPATIBLE_HTILE;
-       }
-
-       switch (surf->u.gfx9.surf.swizzle_mode) {
-               /* S = standard. */
-               case ADDR_SW_256B_S:
-               case ADDR_SW_4KB_S:
-               case ADDR_SW_64KB_S:
-               case ADDR_SW_64KB_S_T:
-               case ADDR_SW_4KB_S_X:
-               case ADDR_SW_64KB_S_X:
-                       surf->micro_tile_mode = RADEON_MICRO_MODE_STANDARD;
-                       break;
-
-               /* D = display. */
-               case ADDR_SW_LINEAR:
-               case ADDR_SW_256B_D:
-               case ADDR_SW_4KB_D:
-               case ADDR_SW_64KB_D:
-               case ADDR_SW_64KB_D_T:
-               case ADDR_SW_4KB_D_X:
-               case ADDR_SW_64KB_D_X:
-                       surf->micro_tile_mode = RADEON_MICRO_MODE_DISPLAY;
-                       break;
-
-               /* R = rotated (gfx9), render target (gfx10). */
-               case ADDR_SW_256B_R:
-               case ADDR_SW_4KB_R:
-               case ADDR_SW_64KB_R:
-               case ADDR_SW_64KB_R_T:
-               case ADDR_SW_4KB_R_X:
-               case ADDR_SW_64KB_R_X:
-               case ADDR_SW_VAR_R_X:
-                       /* The rotated micro tile mode doesn't work if both CMASK and RB+ are
-                        * used at the same time. We currently do not use rotated
-                        * in gfx9.
-                        */
-                       assert(info->chip_class >= GFX10 ||
-                              !"rotate micro tile mode is unsupported");
-                       surf->micro_tile_mode = RADEON_MICRO_MODE_RENDER;
-                       break;
-
-               /* Z = depth. */
-               case ADDR_SW_4KB_Z:
-               case ADDR_SW_64KB_Z:
-               case ADDR_SW_64KB_Z_T:
-               case ADDR_SW_4KB_Z_X:
-               case ADDR_SW_64KB_Z_X:
-               case ADDR_SW_VAR_Z_X:
-                       surf->micro_tile_mode = RADEON_MICRO_MODE_DEPTH;
-                       break;
-
-               default:
-                       assert(0);
-       }
-
-       return 0;
+   bool compressed;
+   ADDR2_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0};
+   int r;
+
+   AddrSurfInfoIn.size = sizeof(ADDR2_COMPUTE_SURFACE_INFO_INPUT);
+
+   compressed = surf->blk_w == 4 && surf->blk_h == 4;
+
+   /* The format must be set correctly for the allocation of compressed
+    * textures to work. In other cases, setting the bpp is sufficient. */
+   if (compressed) {
+      switch (surf->bpe) {
+      case 8:
+         AddrSurfInfoIn.format = ADDR_FMT_BC1;
+         break;
+      case 16:
+         AddrSurfInfoIn.format = ADDR_FMT_BC3;
+         break;
+      default:
+         assert(0);
+      }
+   } else {
+      switch (surf->bpe) {
+      case 1:
+         assert(!(surf->flags & RADEON_SURF_ZBUFFER));
+         AddrSurfInfoIn.format = ADDR_FMT_8;
+         break;
+      case 2:
+         assert(surf->flags & RADEON_SURF_ZBUFFER || !(surf->flags & RADEON_SURF_SBUFFER));
+         AddrSurfInfoIn.format = ADDR_FMT_16;
+         break;
+      case 4:
+         assert(surf->flags & RADEON_SURF_ZBUFFER || !(surf->flags & RADEON_SURF_SBUFFER));
+         AddrSurfInfoIn.format = ADDR_FMT_32;
+         break;
+      case 8:
+         assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
+         AddrSurfInfoIn.format = ADDR_FMT_32_32;
+         break;
+      case 12:
+         assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
+         AddrSurfInfoIn.format = ADDR_FMT_32_32_32;
+         break;
+      case 16:
+         assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
+         AddrSurfInfoIn.format = ADDR_FMT_32_32_32_32;
+         break;
+      default:
+         assert(0);
+      }
+      AddrSurfInfoIn.bpp = surf->bpe * 8;
+   }
+
+   bool is_color_surface = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER);
+   AddrSurfInfoIn.flags.color = is_color_surface && !(surf->flags & RADEON_SURF_NO_RENDER_TARGET);
+   AddrSurfInfoIn.flags.depth = (surf->flags & RADEON_SURF_ZBUFFER) != 0;
+   AddrSurfInfoIn.flags.display = get_display_flag(config, surf);
+   /* flags.texture currently refers to TC-compatible HTILE */
+   AddrSurfInfoIn.flags.texture = is_color_surface || surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE;
+   AddrSurfInfoIn.flags.opt4space = 1;
+
+   AddrSurfInfoIn.numMipLevels = config->info.levels;
+   AddrSurfInfoIn.numSamples = MAX2(1, config->info.samples);
+   AddrSurfInfoIn.numFrags = AddrSurfInfoIn.numSamples;
+
+   if (!(surf->flags & RADEON_SURF_Z_OR_SBUFFER))
+      AddrSurfInfoIn.numFrags = MAX2(1, config->info.storage_samples);
+
+   /* GFX9 doesn't support 1D depth textures, so allocate all 1D textures
+    * as 2D to avoid having shader variants for 1D vs 2D; all shaders must
+    * therefore sample 1D textures as 2D. */
+   if (config->is_3d)
+      AddrSurfInfoIn.resourceType = ADDR_RSRC_TEX_3D;
+   else if (info->chip_class != GFX9 && config->is_1d)
+      AddrSurfInfoIn.resourceType = ADDR_RSRC_TEX_1D;
+   else
+      AddrSurfInfoIn.resourceType = ADDR_RSRC_TEX_2D;
+
+   AddrSurfInfoIn.width = config->info.width;
+   AddrSurfInfoIn.height = config->info.height;
+
+   if (config->is_3d)
+      AddrSurfInfoIn.numSlices = config->info.depth;
+   else if (config->is_cube)
+      AddrSurfInfoIn.numSlices = 6;
+   else
+      AddrSurfInfoIn.numSlices = config->info.array_size;
+
+   /* This is propagated to DCC. It must be 0 for HTILE and CMASK. */
+   AddrSurfInfoIn.flags.metaPipeUnaligned = 0;
+   AddrSurfInfoIn.flags.metaRbUnaligned = 0;
+
+   /* Optimal values for the L2 cache. */
+   if (info->chip_class == GFX9) {
+      surf->u.gfx9.dcc.independent_64B_blocks = 1;
+      surf->u.gfx9.dcc.independent_128B_blocks = 0;
+      surf->u.gfx9.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B;
+   } else if (info->chip_class >= GFX10) {
+      surf->u.gfx9.dcc.independent_64B_blocks = 0;
+      surf->u.gfx9.dcc.independent_128B_blocks = 1;
+      surf->u.gfx9.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B;
+   }
+
+   if (AddrSurfInfoIn.flags.display) {
+      /* The display hardware can only read DCC with RB_ALIGNED=0 and
+       * PIPE_ALIGNED=0. PIPE_ALIGNED really means L2CACHE_ALIGNED.
+       *
+       * The CB block requires RB_ALIGNED=1, except on chips with 1 RB.
+       * PIPE_ALIGNED is optional, but PIPE_ALIGNED=0 requires L2 flushes
+       * after rendering, so PIPE_ALIGNED=1 is recommended.
+       */
+      if (info->use_display_dcc_unaligned) {
+         AddrSurfInfoIn.flags.metaPipeUnaligned = 1;
+         AddrSurfInfoIn.flags.metaRbUnaligned = 1;
+      }
+
+      /* Adjust DCC settings to meet DCN requirements. */
+      if (info->use_display_dcc_unaligned || info->use_display_dcc_with_retile_blit) {
+         /* Only Navi12/14 support independent 64B blocks in L2,
+          * but without DCC image stores.
+          */
+         if (info->family == CHIP_NAVI12 || info->family == CHIP_NAVI14) {
+            surf->u.gfx9.dcc.independent_64B_blocks = 1;
+            surf->u.gfx9.dcc.independent_128B_blocks = 0;
+            surf->u.gfx9.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B;
+         }
+
+         if (info->chip_class >= GFX10_3) {
+            surf->u.gfx9.dcc.independent_64B_blocks = 1;
+            surf->u.gfx9.dcc.independent_128B_blocks = 1;
+            surf->u.gfx9.dcc.max_compressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B;
+         }
+      }
+   }
+
+   switch (mode) {
+   case RADEON_SURF_MODE_LINEAR_ALIGNED:
+      assert(config->info.samples <= 1);
+      assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
+      AddrSurfInfoIn.swizzleMode = ADDR_SW_LINEAR;
+      break;
+
+   case RADEON_SURF_MODE_1D:
+   case RADEON_SURF_MODE_2D:
+      if (surf->flags & RADEON_SURF_IMPORTED ||
+          (info->chip_class >= GFX10 && surf->flags & RADEON_SURF_FORCE_SWIZZLE_MODE)) {
+         AddrSurfInfoIn.swizzleMode = surf->u.gfx9.surf.swizzle_mode;
+         break;
+      }
+
+      r = gfx9_get_preferred_swizzle_mode(addrlib->handle, surf, &AddrSurfInfoIn, false,
+                                          &AddrSurfInfoIn.swizzleMode);
+      if (r)
+         return r;
+      break;
+
+   default:
+      assert(0);
+   }
+
+   surf->u.gfx9.resource_type = AddrSurfInfoIn.resourceType;
+   surf->has_stencil = !!(surf->flags & RADEON_SURF_SBUFFER);
+
+   surf->num_dcc_levels = 0;
+   surf->surf_size = 0;
+   surf->fmask_size = 0;
+   surf->dcc_size = 0;
+   surf->htile_size = 0;
+   surf->htile_slice_size = 0;
+   surf->u.gfx9.surf_offset = 0;
+   surf->u.gfx9.stencil_offset = 0;
+   surf->cmask_size = 0;
+   surf->u.gfx9.dcc_retile_use_uint16 = false;
+   surf->u.gfx9.dcc_retile_num_elements = 0;
+   surf->u.gfx9.dcc_retile_map = NULL;
+
+   /* Calculate texture layout information. */
+   r = gfx9_compute_miptree(addrlib, info, config, surf, compressed, &AddrSurfInfoIn);
+   if (r)
+      return r;
+
+   /* Calculate texture layout information for stencil. */
+   if (surf->flags & RADEON_SURF_SBUFFER) {
+      AddrSurfInfoIn.flags.stencil = 1;
+      AddrSurfInfoIn.bpp = 8;
+      AddrSurfInfoIn.format = ADDR_FMT_8;
+
+      if (!AddrSurfInfoIn.flags.depth) {
+         r = gfx9_get_preferred_swizzle_mode(addrlib->handle, surf, &AddrSurfInfoIn, false,
+                                             &AddrSurfInfoIn.swizzleMode);
+         if (r)
+            return r;
+      } else
+         AddrSurfInfoIn.flags.depth = 0;
+
+      r = gfx9_compute_miptree(addrlib, info, config, surf, compressed, &AddrSurfInfoIn);
+      if (r)
+         return r;
+   }
+
+   surf->is_linear = surf->u.gfx9.surf.swizzle_mode == ADDR_SW_LINEAR;
+
+   /* Query whether the surface is displayable. */
+   /* This is only useful for surfaces that are allocated without SCANOUT. */
+   bool displayable = false;
+   if (!config->is_3d && !config->is_cube) {
+      r = Addr2IsValidDisplaySwizzleMode(addrlib->handle, surf->u.gfx9.surf.swizzle_mode,
+                                         surf->bpe * 8, &displayable);
+      if (r)
+         return r;
+
+      /* Display needs unaligned DCC. */
+      if (surf->num_dcc_levels &&
+          (!is_dcc_supported_by_DCN(info, config, surf, surf->u.gfx9.dcc.rb_aligned,
+                                    surf->u.gfx9.dcc.pipe_aligned) ||
+           /* Don't set is_displayable if displayable DCC is missing. */
+           (info->use_display_dcc_with_retile_blit && !surf->u.gfx9.dcc_retile_num_elements)))
+         displayable = false;
+   }
+   surf->is_displayable = displayable;
+
+   /* Validate that we allocated a displayable surface if requested. */
+   assert(!AddrSurfInfoIn.flags.display || surf->is_displayable);
+
+   /* Validate that DCC is set up correctly. */
+   if (surf->num_dcc_levels) {
+      assert(is_dcc_supported_by_L2(info, surf));
+      if (AddrSurfInfoIn.flags.color)
+         assert(is_dcc_supported_by_CB(info, surf->u.gfx9.surf.swizzle_mode));
+      if (AddrSurfInfoIn.flags.display) {
+         assert(is_dcc_supported_by_DCN(info, config, surf, surf->u.gfx9.dcc.rb_aligned,
+                                        surf->u.gfx9.dcc.pipe_aligned));
+      }
+   }
+
+   if (info->has_graphics && !compressed && !config->is_3d && config->info.levels == 1 &&
+       AddrSurfInfoIn.flags.color && !surf->is_linear &&
+       surf->surf_alignment >= 64 * 1024 && /* 64KB tiling */
+       !(surf->flags & (RADEON_SURF_DISABLE_DCC | RADEON_SURF_FORCE_SWIZZLE_MODE |
+                        RADEON_SURF_FORCE_MICRO_TILE_MODE))) {
+      /* Validate that DCC is enabled if DCN can do it. */
+      if ((info->use_display_dcc_unaligned || info->use_display_dcc_with_retile_blit) &&
+          AddrSurfInfoIn.flags.display && surf->bpe == 4) {
+         assert(surf->num_dcc_levels);
+      }
+
+      /* Validate that non-scanout DCC is always enabled. */
+      if (!AddrSurfInfoIn.flags.display)
+         assert(surf->num_dcc_levels);
+   }
+
+   if (!surf->htile_size) {
+      /* Unset this if HTILE is not present. */
+      surf->flags &= ~RADEON_SURF_TC_COMPATIBLE_HTILE;
+   }
+
+   switch (surf->u.gfx9.surf.swizzle_mode) {
+   /* S = standard. */
+   case ADDR_SW_256B_S:
+   case ADDR_SW_4KB_S:
+   case ADDR_SW_64KB_S:
+   case ADDR_SW_64KB_S_T:
+   case ADDR_SW_4KB_S_X:
+   case ADDR_SW_64KB_S_X:
+      surf->micro_tile_mode = RADEON_MICRO_MODE_STANDARD;
+      break;
+
+   /* D = display. */
+   case ADDR_SW_LINEAR:
+   case ADDR_SW_256B_D:
+   case ADDR_SW_4KB_D:
+   case ADDR_SW_64KB_D:
+   case ADDR_SW_64KB_D_T:
+   case ADDR_SW_4KB_D_X:
+   case ADDR_SW_64KB_D_X:
+      surf->micro_tile_mode = RADEON_MICRO_MODE_DISPLAY;
+      break;
+
+   /* R = rotated (gfx9), render target (gfx10). */
+   case ADDR_SW_256B_R:
+   case ADDR_SW_4KB_R:
+   case ADDR_SW_64KB_R:
+   case ADDR_SW_64KB_R_T:
+   case ADDR_SW_4KB_R_X:
+   case ADDR_SW_64KB_R_X:
+   case ADDR_SW_VAR_R_X:
+      /* The rotated micro tile mode doesn't work if both CMASK and RB+ are
+       * used at the same time. We currently do not use the rotated mode
+       * on gfx9.
+       */
+      assert(info->chip_class >= GFX10 || !"rotate micro tile mode is unsupported");
+      surf->micro_tile_mode = RADEON_MICRO_MODE_RENDER;
+      break;
+
+   /* Z = depth. */
+   case ADDR_SW_4KB_Z:
+   case ADDR_SW_64KB_Z:
+   case ADDR_SW_64KB_Z_T:
+   case ADDR_SW_4KB_Z_X:
+   case ADDR_SW_64KB_Z_X:
+   case ADDR_SW_VAR_Z_X:
+      surf->micro_tile_mode = RADEON_MICRO_MODE_DEPTH;
+      break;
+
+   default:
+      assert(0);
+   }
+
+   return 0;
 }
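
The ADDR_SW_LINEAR pitch adjustment in gfx9_compute_miptree() above can also be illustrated in isolation. In the sketch below, blk_w = 2, bpe = 4 and the 1000x600 input are made-up values; the pitch reported in pixels is converted to element units while keeping each row a multiple of 256 bytes, and the slice/surface sizes are recomputed with the same formula as above:

#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) / (a) * (a))

int main(void)
{
   /* Made-up format and dimensions, for illustration only. */
   unsigned blk_w = 2;          /* block width in pixels */
   unsigned bpe = 4;            /* bytes per element (per block) */
   unsigned pixel_pitch = 1000; /* pitch in pixels, as returned by addrlib */
   unsigned height = 600;
   unsigned num_slices = 1;

   /* Convert the pixel pitch to element units, keeping the row a multiple
    * of 256 bytes (256 / bpe elements).
    */
   unsigned surf_pitch = ALIGN(pixel_pitch / blk_w, 256 / bpe);

   /* Recompute the slice and surface sizes to match the adjusted pitch,
    * using the same formula as the linear path above.
    */
   unsigned long long slice_size =
      (unsigned long long)surf_pitch * height * bpe * blk_w;

   printf("pitch = %u elements, slice = %llu bytes, surface = %llu bytes\n",
          surf_pitch, slice_size, slice_size * num_slices);
   return 0;
}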
 
 int ac_compute_surface(struct ac_addrlib *addrlib, const struct radeon_info *info,
-                      const struct ac_surf_config *config,
-                      enum radeon_surf_mode mode,
-                      struct radeon_surf *surf)
+                       const struct ac_surf_config *config, enum radeon_surf_mode mode,
+                       struct radeon_surf *surf)
 {
-       int r;
-
-       r = surf_config_sanity(config, surf->flags);
-       if (r)
-               return r;
-
-       if (info->chip_class >= GFX9)
-               r = gfx9_compute_surface(addrlib, info, config, mode, surf);
-       else
-               r = gfx6_compute_surface(addrlib->handle, info, config, mode, surf);
-
-       if (r)
-               return r;
-
-       /* Determine the memory layout of multiple allocations in one buffer. */
-       surf->total_size = surf->surf_size;
-       surf->alignment = surf->surf_alignment;
-
-       if (surf->htile_size) {
-               surf->htile_offset = align64(surf->total_size, surf->htile_alignment);
-               surf->total_size = surf->htile_offset + surf->htile_size;
-               surf->alignment = MAX2(surf->alignment, surf->htile_alignment);
-       }
-
-       if (surf->fmask_size) {
-               assert(config->info.samples >= 2);
-               surf->fmask_offset = align64(surf->total_size, surf->fmask_alignment);
-               surf->total_size = surf->fmask_offset + surf->fmask_size;
-               surf->alignment = MAX2(surf->alignment, surf->fmask_alignment);
-       }
-
-       /* Single-sample CMASK is in a separate buffer. */
-       if (surf->cmask_size && config->info.samples >= 2) {
-               surf->cmask_offset = align64(surf->total_size, surf->cmask_alignment);
-               surf->total_size = surf->cmask_offset + surf->cmask_size;
-               surf->alignment = MAX2(surf->alignment, surf->cmask_alignment);
-       }
-
-       if (surf->is_displayable)
-               surf->flags |= RADEON_SURF_SCANOUT;
-
-       if (surf->dcc_size &&
-           /* dcc_size is computed on GFX9+ only if it's displayable. */
-           (info->chip_class >= GFX9 || !get_display_flag(config, surf))) {
-               /* It's better when displayable DCC is immediately after
-                * the image due to hw-specific reasons.
-                */
-               if (info->chip_class >= GFX9 &&
-                   surf->u.gfx9.dcc_retile_num_elements) {
-                       /* Add space for the displayable DCC buffer. */
-                       surf->display_dcc_offset =
-                               align64(surf->total_size, surf->u.gfx9.display_dcc_alignment);
-                       surf->total_size = surf->display_dcc_offset +
-                                          surf->u.gfx9.display_dcc_size;
-
-                       /* Add space for the DCC retile buffer. (16-bit or 32-bit elements) */
-                       surf->dcc_retile_map_offset =
-                               align64(surf->total_size, info->tcc_cache_line_size);
-
-                       if (surf->u.gfx9.dcc_retile_use_uint16) {
-                               surf->total_size = surf->dcc_retile_map_offset +
-                                                  surf->u.gfx9.dcc_retile_num_elements * 2;
-                       } else {
-                               surf->total_size = surf->dcc_retile_map_offset +
-                                                  surf->u.gfx9.dcc_retile_num_elements * 4;
-                       }
-               }
-
-               surf->dcc_offset = align64(surf->total_size, surf->dcc_alignment);
-               surf->total_size = surf->dcc_offset + surf->dcc_size;
-               surf->alignment = MAX2(surf->alignment, surf->dcc_alignment);
-       }
-
-       return 0;
+   int r;
+
+   r = surf_config_sanity(config, surf->flags);
+   if (r)
+      return r;
+
+   if (info->chip_class >= GFX9)
+      r = gfx9_compute_surface(addrlib, info, config, mode, surf);
+   else
+      r = gfx6_compute_surface(addrlib->handle, info, config, mode, surf);
+
+   if (r)
+      return r;
+
+   /* Determine the memory layout of multiple allocations in one buffer. */
+   surf->total_size = surf->surf_size;
+   surf->alignment = surf->surf_alignment;
+
+   if (surf->htile_size) {
+      surf->htile_offset = align64(surf->total_size, surf->htile_alignment);
+      surf->total_size = surf->htile_offset + surf->htile_size;
+      surf->alignment = MAX2(surf->alignment, surf->htile_alignment);
+   }
+
+   if (surf->fmask_size) {
+      assert(config->info.samples >= 2);
+      surf->fmask_offset = align64(surf->total_size, surf->fmask_alignment);
+      surf->total_size = surf->fmask_offset + surf->fmask_size;
+      surf->alignment = MAX2(surf->alignment, surf->fmask_alignment);
+   }
+
+   /* Single-sample CMASK is in a separate buffer. */
+   if (surf->cmask_size && config->info.samples >= 2) {
+      surf->cmask_offset = align64(surf->total_size, surf->cmask_alignment);
+      surf->total_size = surf->cmask_offset + surf->cmask_size;
+      surf->alignment = MAX2(surf->alignment, surf->cmask_alignment);
+   }
+
+   if (surf->is_displayable)
+      surf->flags |= RADEON_SURF_SCANOUT;
+
+   if (surf->dcc_size &&
+       /* dcc_size is computed on GFX9+ only if it's displayable. */
+       (info->chip_class >= GFX9 || !get_display_flag(config, surf))) {
+      /* It's better when displayable DCC is immediately after
+       * the image due to hw-specific reasons.
+       */
+      if (info->chip_class >= GFX9 && surf->u.gfx9.dcc_retile_num_elements) {
+         /* Add space for the displayable DCC buffer. */
+         surf->display_dcc_offset = align64(surf->total_size, surf->u.gfx9.display_dcc_alignment);
+         surf->total_size = surf->display_dcc_offset + surf->u.gfx9.display_dcc_size;
+
+         /* Add space for the DCC retile buffer. (16-bit or 32-bit elements) */
+         surf->dcc_retile_map_offset = align64(surf->total_size, info->tcc_cache_line_size);
+
+         if (surf->u.gfx9.dcc_retile_use_uint16) {
+            surf->total_size =
+               surf->dcc_retile_map_offset + surf->u.gfx9.dcc_retile_num_elements * 2;
+         } else {
+            surf->total_size =
+               surf->dcc_retile_map_offset + surf->u.gfx9.dcc_retile_num_elements * 4;
+         }
+      }
+
+      surf->dcc_offset = align64(surf->total_size, surf->dcc_alignment);
+      surf->total_size = surf->dcc_offset + surf->dcc_size;
+      surf->alignment = MAX2(surf->alignment, surf->dcc_alignment);
+   }
+
+   return 0;
 }
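
The layout pass at the end of ac_compute_surface() above is a repeated align-then-advance over the metadata sub-allocations. A standalone sketch follows, with made-up sizes and alignments (the real values come from the surface computation):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

/* Power-of-two alignment helper, standing in for align64 above. */
static uint64_t align64(uint64_t v, uint64_t a)
{
   return (v + a - 1) & ~(a - 1);
}

int main(void)
{
   uint64_t total = 4ull * 1024 * 1024; /* main surface, made-up size */
   uint64_t alignment = 64 * 1024;

   /* Hypothetical sub-allocations packed into the same buffer. */
   const struct {
      const char *name;
      uint64_t size, align;
   } subs[] = {
      {"htile", 128 * 1024, 32 * 1024},
      {"fmask", 512 * 1024, 64 * 1024},
      {"cmask", 16 * 1024, 4 * 1024},
      {"dcc", 64 * 1024, 4 * 1024},
   };

   for (unsigned i = 0; i < 4; i++) {
      /* Align the running offset, place the allocation, advance. */
      uint64_t offset = align64(total, subs[i].align);
      total = offset + subs[i].size;
      alignment = MAX2(alignment, subs[i].align);
      printf("%-5s at offset 0x%" PRIx64 "\n", subs[i].name, offset);
   }

   printf("total 0x%" PRIx64 ", alignment 0x%" PRIx64 "\n", total, alignment);
   return 0;
}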
 
 /* This is meant to be used for disabling DCC. */
@@ -2289,14 +2166,28 @@ void ac_surface_zero_dcc_fields(struct radeon_surf *surf)
 static unsigned eg_tile_split(unsigned tile_split)
 {
    switch (tile_split) {
-   case 0:     tile_split = 64;    break;
-   case 1:     tile_split = 128;   break;
-   case 2:     tile_split = 256;   break;
-   case 3:     tile_split = 512;   break;
+   case 0:
+      tile_split = 64;
+      break;
+   case 1:
+      tile_split = 128;
+      break;
+   case 2:
+      tile_split = 256;
+      break;
+   case 3:
+      tile_split = 512;
+      break;
    default:
-   case 4:     tile_split = 1024;  break;
-   case 5:     tile_split = 2048;  break;
-   case 6:     tile_split = 4096;  break;
+   case 4:
+      tile_split = 1024;
+      break;
+   case 5:
+      tile_split = 2048;
+      break;
+   case 6:
+      tile_split = 4096;
+      break;
    }
    return tile_split;
 }
@@ -2304,35 +2195,45 @@ static unsigned eg_tile_split(unsigned tile_split)
 static unsigned eg_tile_split_rev(unsigned eg_tile_split)
 {
    switch (eg_tile_split) {
-   case 64:    return 0;
-   case 128:   return 1;
-   case 256:   return 2;
-   case 512:   return 3;
+   case 64:
+      return 0;
+   case 128:
+      return 1;
+   case 256:
+      return 2;
+   case 512:
+      return 3;
    default:
-   case 1024:  return 4;
-   case 2048:  return 5;
-   case 4096:  return 6;
+   case 1024:
+      return 4;
+   case 2048:
+      return 5;
+   case 4096:
+      return 6;
    }
 }
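
eg_tile_split() and eg_tile_split_rev() above are inverse lookups between the 3-bit hardware encoding (0..6) and the tile split size in bytes (64..4096). Since the sizes are powers of two, both directions reduce to a shift; the sketch below only restates that table and checks the round trip:

#include <assert.h>
#include <stdio.h>

int main(void)
{
   for (unsigned enc = 0; enc <= 6; enc++) {
      unsigned bytes = 64u << enc;  /* same mapping as eg_tile_split() */
      unsigned back = 0;
      while ((64u << back) < bytes) /* same mapping as eg_tile_split_rev() */
         back++;
      assert(back == enc);
      printf("encoding %u <-> %u bytes\n", enc, bytes);
   }
   return 0;
}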
 
-#define AMDGPU_TILING_DCC_MAX_COMPRESSED_BLOCK_SIZE_SHIFT  45
-#define AMDGPU_TILING_DCC_MAX_COMPRESSED_BLOCK_SIZE_MASK   0x3
+#define AMDGPU_TILING_DCC_MAX_COMPRESSED_BLOCK_SIZE_SHIFT 45
+#define AMDGPU_TILING_DCC_MAX_COMPRESSED_BLOCK_SIZE_MASK  0x3
 
 /* This should be called before ac_compute_surface. */
-void ac_surface_set_bo_metadata(const struct radeon_info *info,
-                                struct radeon_surf *surf, uint64_t tiling_flags,
-                                enum radeon_surf_mode *mode)
+void ac_surface_set_bo_metadata(const struct radeon_info *info, struct radeon_surf *surf,
+                                uint64_t tiling_flags, enum radeon_surf_mode *mode)
 {
    bool scanout;
 
    if (info->chip_class >= GFX9) {
       surf->u.gfx9.surf.swizzle_mode = AMDGPU_TILING_GET(tiling_flags, SWIZZLE_MODE);
-      surf->u.gfx9.dcc.independent_64B_blocks = AMDGPU_TILING_GET(tiling_flags, DCC_INDEPENDENT_64B);
-      surf->u.gfx9.dcc.independent_128B_blocks = AMDGPU_TILING_GET(tiling_flags, DCC_INDEPENDENT_128B);
-      surf->u.gfx9.dcc.max_compressed_block_size = AMDGPU_TILING_GET(tiling_flags, DCC_MAX_COMPRESSED_BLOCK_SIZE);
+      surf->u.gfx9.dcc.independent_64B_blocks =
+         AMDGPU_TILING_GET(tiling_flags, DCC_INDEPENDENT_64B);
+      surf->u.gfx9.dcc.independent_128B_blocks =
+         AMDGPU_TILING_GET(tiling_flags, DCC_INDEPENDENT_128B);
+      surf->u.gfx9.dcc.max_compressed_block_size =
+         AMDGPU_TILING_GET(tiling_flags, DCC_MAX_COMPRESSED_BLOCK_SIZE);
       surf->u.gfx9.display_dcc_pitch_max = AMDGPU_TILING_GET(tiling_flags, DCC_PITCH_MAX);
       scanout = AMDGPU_TILING_GET(tiling_flags, SCANOUT);
-      *mode = surf->u.gfx9.surf.swizzle_mode > 0 ? RADEON_SURF_MODE_2D : RADEON_SURF_MODE_LINEAR_ALIGNED;
+      *mode =
+         surf->u.gfx9.surf.swizzle_mode > 0 ? RADEON_SURF_MODE_2D : RADEON_SURF_MODE_LINEAR_ALIGNED;
    } else {
       surf->u.legacy.pipe_config = AMDGPU_TILING_GET(tiling_flags, PIPE_CONFIG);
       surf->u.legacy.bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH);
@@ -2342,7 +2243,7 @@ void ac_surface_set_bo_metadata(const struct radeon_info *info,
       surf->u.legacy.num_banks = 2 << AMDGPU_TILING_GET(tiling_flags, NUM_BANKS);
       scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */
 
-      if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4)  /* 2D_TILED_THIN1 */
+      if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4) /* 2D_TILED_THIN1 */
          *mode = RADEON_SURF_MODE_2D;
       else if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 2) /* 1D_TILED_THIN1 */
          *mode = RADEON_SURF_MODE_1D;
@@ -2356,8 +2257,8 @@ void ac_surface_set_bo_metadata(const struct radeon_info *info,
       surf->flags &= ~RADEON_SURF_SCANOUT;
 }
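
ac_surface_set_bo_metadata() above unpacks a single 64-bit tiling_flags word whose fields follow the usual shift/mask scheme behind AMDGPU_TILING_GET/SET. A sketch of that scheme with invented field positions (the real shifts and masks live in the drm uapi header, not in this file):

#define XTILE_SWIZZLE_MODE_SHIFT 0
#define XTILE_SWIZZLE_MODE_MASK  0x1f
#define XTILE_SCANOUT_SHIFT      63
#define XTILE_SCANOUT_MASK       0x1

#define XTILE_SET(field, value)                                                                    \
   (((uint64_t)(value) & XTILE_##field##_MASK) << XTILE_##field##_SHIFT)
#define XTILE_GET(value, field)                                                                    \
   (((uint64_t)(value) >> XTILE_##field##_SHIFT) & XTILE_##field##_MASK)

/* uint64_t flags = XTILE_SET(SWIZZLE_MODE, 25) | XTILE_SET(SCANOUT, 1);
 * XTILE_GET(flags, SWIZZLE_MODE) == 25 and XTILE_GET(flags, SCANOUT) == 1 */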
 
-void ac_surface_get_bo_metadata(const struct radeon_info *info,
-                                struct radeon_surf *surf, uint64_t *tiling_flags)
+void ac_surface_get_bo_metadata(const struct radeon_info *info, struct radeon_surf *surf,
+                                uint64_t *tiling_flags)
 {
    *tiling_flags = 0;
 
@@ -2365,17 +2266,19 @@ void ac_surface_get_bo_metadata(const struct radeon_info *info,
       uint64_t dcc_offset = 0;
 
       if (surf->dcc_offset) {
-         dcc_offset = surf->display_dcc_offset ? surf->display_dcc_offset
-                                               : surf->dcc_offset;
+         dcc_offset = surf->display_dcc_offset ? surf->display_dcc_offset : surf->dcc_offset;
          assert((dcc_offset >> 8) != 0 && (dcc_offset >> 8) < (1 << 24));
       }
 
       *tiling_flags |= AMDGPU_TILING_SET(SWIZZLE_MODE, surf->u.gfx9.surf.swizzle_mode);
       *tiling_flags |= AMDGPU_TILING_SET(DCC_OFFSET_256B, dcc_offset >> 8);
       *tiling_flags |= AMDGPU_TILING_SET(DCC_PITCH_MAX, surf->u.gfx9.display_dcc_pitch_max);
-      *tiling_flags |= AMDGPU_TILING_SET(DCC_INDEPENDENT_64B, surf->u.gfx9.dcc.independent_64B_blocks);
-      *tiling_flags |= AMDGPU_TILING_SET(DCC_INDEPENDENT_128B, surf->u.gfx9.dcc.independent_128B_blocks);
-      *tiling_flags |= AMDGPU_TILING_SET(DCC_MAX_COMPRESSED_BLOCK_SIZE, surf->u.gfx9.dcc.max_compressed_block_size);
+      *tiling_flags |=
+         AMDGPU_TILING_SET(DCC_INDEPENDENT_64B, surf->u.gfx9.dcc.independent_64B_blocks);
+      *tiling_flags |=
+         AMDGPU_TILING_SET(DCC_INDEPENDENT_128B, surf->u.gfx9.dcc.independent_128B_blocks);
+      *tiling_flags |= AMDGPU_TILING_SET(DCC_MAX_COMPRESSED_BLOCK_SIZE,
+                                         surf->u.gfx9.dcc.max_compressed_block_size);
       *tiling_flags |= AMDGPU_TILING_SET(SCANOUT, (surf->flags & RADEON_SURF_SCANOUT) != 0);
    } else {
       if (surf->u.legacy.level[0].mode >= RADEON_SURF_MODE_2D)
@@ -2389,9 +2292,10 @@ void ac_surface_get_bo_metadata(const struct radeon_info *info,
       *tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(surf->u.legacy.bankw));
       *tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(surf->u.legacy.bankh));
       if (surf->u.legacy.tile_split)
-         *tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(surf->u.legacy.tile_split));
+         *tiling_flags |=
+            AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(surf->u.legacy.tile_split));
       *tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(surf->u.legacy.mtilea));
-      *tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(surf->u.legacy.num_banks)-1);
+      *tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(surf->u.legacy.num_banks) - 1);
 
       if (surf->flags & RADEON_SURF_SCANOUT)
          *tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */
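
In the GFX9+ branch of ac_surface_get_bo_metadata() above, the DCC (or displayable DCC) offset is stored in DCC_OFFSET_256B in units of 256 bytes, and the assert bounds the encoded value to 24 bits, i.e. offsets below 4 GiB. A purely illustrative worked example:

static uint64_t encode_dcc_offset_256b(void)
{
   uint64_t dcc_offset = 1024 * 1024;  /* DCC placed 1 MiB after the image start */
   uint64_t encoded = dcc_offset >> 8; /* 4096 units of 256 bytes */

   assert(encoded != 0 && encoded < (1 << 24)); /* same bounds as the assert above */
   return encoded;
}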
@@ -2406,12 +2310,9 @@ static uint32_t ac_get_umd_metadata_word1(const struct radeon_info *info)
 }
 
 /* This should be called after ac_compute_surface. */
-bool ac_surface_set_umd_metadata(const struct radeon_info *info,
-                                 struct radeon_surf *surf,
-                                 unsigned num_storage_samples,
-                                 unsigned num_mipmap_levels,
-                                 unsigned size_metadata,
-                                 uint32_t metadata[64])
+bool ac_surface_set_umd_metadata(const struct radeon_info *info, struct radeon_surf *surf,
+                                 unsigned num_storage_samples, unsigned num_mipmap_levels,
+                                 unsigned size_metadata, uint32_t metadata[64])
 {
    uint32_t *desc = &metadata[2];
    uint64_t offset;
@@ -2497,10 +2398,8 @@ bool ac_surface_set_umd_metadata(const struct radeon_info *info,
    return true;
 }
 
-void ac_surface_get_umd_metadata(const struct radeon_info *info,
-                                 struct radeon_surf *surf,
-                                 unsigned num_mipmap_levels,
-                                 uint32_t desc[8],
+void ac_surface_get_umd_metadata(const struct radeon_info *info, struct radeon_surf *surf,
+                                 unsigned num_mipmap_levels, uint32_t desc[8],
                                  unsigned *size_metadata, uint32_t metadata[64])
 {
    /* Clear the base address and set the relative DCC offset. */
@@ -2557,18 +2456,15 @@ void ac_surface_get_umd_metadata(const struct radeon_info *info,
    }
 }
 
-void ac_surface_override_offset_stride(const struct radeon_info *info,
-                                       struct radeon_surf *surf,
-                                       unsigned num_mipmap_levels,
-                                       uint64_t offset, unsigned pitch)
+void ac_surface_override_offset_stride(const struct radeon_info *info, struct radeon_surf *surf,
+                                       unsigned num_mipmap_levels, uint64_t offset, unsigned pitch)
 {
    if (info->chip_class >= GFX9) {
       if (pitch) {
          surf->u.gfx9.surf_pitch = pitch;
          if (num_mipmap_levels == 1)
             surf->u.gfx9.surf.epitch = pitch - 1;
-         surf->u.gfx9.surf_slice_size =
-               (uint64_t)pitch * surf->u.gfx9.surf_height * surf->bpe;
+         surf->u.gfx9.surf_slice_size = (uint64_t)pitch * surf->u.gfx9.surf_height * surf->bpe;
       }
       surf->u.gfx9.surf_offset = offset;
       if (surf->u.gfx9.stencil_offset)
@@ -2577,7 +2473,7 @@ void ac_surface_override_offset_stride(const struct radeon_info *info,
       if (pitch) {
          surf->u.legacy.level[0].nblk_x = pitch;
          surf->u.legacy.level[0].slice_size_dw =
-               ((uint64_t)pitch * surf->u.legacy.level[0].nblk_y * surf->bpe) / 4;
+            ((uint64_t)pitch * surf->u.legacy.level[0].nblk_y * surf->bpe) / 4;
       }
 
       if (offset) {
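
ac_surface_override_offset_stride() recomputes the per-slice size from the overridden pitch: bytes on GFX9+ (pitch * height * bpe) and dwords on GFX6-8 (the same product divided by 4). A short sketch with illustrative numbers, not taken from the patch:

static void slice_size_example(void)
{
   /* 1024x1024 blocks at 4 bytes per element */
   uint64_t pitch = 1024, nblk_y = 1024, bpe = 4;
   uint64_t slice_size = pitch * nblk_y * bpe;          /* 4 MiB: GFX9+ surf_slice_size */
   uint32_t slice_size_dw = (uint32_t)(slice_size / 4); /* 1048576: legacy slice_size_dw */

   (void)slice_size_dw;
}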
index 5dce25365becc2d6564a838295af4676bbe97e14..5605ba7a3a6db238bb2e951c837b58757f068129 100644 (file)
 #ifndef AC_SURFACE_H
 #define AC_SURFACE_H
 
-#include <stdint.h>
-#include <stdbool.h>
-
 #include "amd_family.h"
 
+#include <stdbool.h>
+#include <stdint.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -41,280 +41,274 @@ struct ac_addrlib;
 struct amdgpu_gpu_info;
 struct radeon_info;
 
-#define RADEON_SURF_MAX_LEVELS                  15
+#define RADEON_SURF_MAX_LEVELS 15
 
-enum radeon_surf_mode {
-    RADEON_SURF_MODE_LINEAR_ALIGNED = 1,
-    RADEON_SURF_MODE_1D = 2,
-    RADEON_SURF_MODE_2D = 3,
+enum radeon_surf_mode
+{
+   RADEON_SURF_MODE_LINEAR_ALIGNED = 1,
+   RADEON_SURF_MODE_1D = 2,
+   RADEON_SURF_MODE_2D = 3,
 };
 
 /* This describes D/S/Z/R swizzle modes.
  * Defined in the GB_TILE_MODEn.MICRO_TILE_MODE_NEW order.
  */
-enum radeon_micro_mode {
-    RADEON_MICRO_MODE_DISPLAY = 0,
-    RADEON_MICRO_MODE_STANDARD = 1,
-    RADEON_MICRO_MODE_DEPTH = 2,
-    RADEON_MICRO_MODE_RENDER = 3, /* gfx9 and older: rotated */
+enum radeon_micro_mode
+{
+   RADEON_MICRO_MODE_DISPLAY = 0,
+   RADEON_MICRO_MODE_STANDARD = 1,
+   RADEON_MICRO_MODE_DEPTH = 2,
+   RADEON_MICRO_MODE_RENDER = 3, /* gfx9 and older: rotated */
 };
 
 /* the first 16 bits are reserved for libdrm_radeon, don't use them */
-#define RADEON_SURF_SCANOUT                     (1 << 16)
-#define RADEON_SURF_ZBUFFER                     (1 << 17)
-#define RADEON_SURF_SBUFFER                     (1 << 18)
-#define RADEON_SURF_Z_OR_SBUFFER                (RADEON_SURF_ZBUFFER | RADEON_SURF_SBUFFER)
+#define RADEON_SURF_SCANOUT      (1 << 16)
+#define RADEON_SURF_ZBUFFER      (1 << 17)
+#define RADEON_SURF_SBUFFER      (1 << 18)
+#define RADEON_SURF_Z_OR_SBUFFER (RADEON_SURF_ZBUFFER | RADEON_SURF_SBUFFER)
 /* bits 19 and 20 are reserved for libdrm_radeon, don't use them */
-#define RADEON_SURF_FMASK                       (1 << 21)
-#define RADEON_SURF_DISABLE_DCC                 (1 << 22)
-#define RADEON_SURF_TC_COMPATIBLE_HTILE         (1 << 23)
-#define RADEON_SURF_IMPORTED                    (1 << 24)
-#define RADEON_SURF_CONTIGUOUS_DCC_LAYERS       (1 << 25)
-#define RADEON_SURF_SHAREABLE                   (1 << 26)
-#define RADEON_SURF_NO_RENDER_TARGET            (1 << 27)
+#define RADEON_SURF_FMASK                 (1 << 21)
+#define RADEON_SURF_DISABLE_DCC           (1 << 22)
+#define RADEON_SURF_TC_COMPATIBLE_HTILE   (1 << 23)
+#define RADEON_SURF_IMPORTED              (1 << 24)
+#define RADEON_SURF_CONTIGUOUS_DCC_LAYERS (1 << 25)
+#define RADEON_SURF_SHAREABLE             (1 << 26)
+#define RADEON_SURF_NO_RENDER_TARGET      (1 << 27)
 /* Force a swizzle mode (gfx9+) or tile mode (gfx6-8).
  * If this is not set, optimize for space. */
-#define RADEON_SURF_FORCE_SWIZZLE_MODE          (1 << 28)
-#define RADEON_SURF_NO_FMASK                    (1 << 29)
-#define RADEON_SURF_NO_HTILE                    (1 << 30)
-#define RADEON_SURF_FORCE_MICRO_TILE_MODE       (1u << 31)
+#define RADEON_SURF_FORCE_SWIZZLE_MODE    (1 << 28)
+#define RADEON_SURF_NO_FMASK              (1 << 29)
+#define RADEON_SURF_NO_HTILE              (1 << 30)
+#define RADEON_SURF_FORCE_MICRO_TILE_MODE (1u << 31)
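
These RADEON_SURF_* values are plain bit masks ORed into radeon_surf::flags. A short, illustrative use (helper name invented; not part of this header):

static bool is_depth_or_stencil(const struct radeon_surf *surf)
{
   /* e.g. after: surf->flags |= RADEON_SURF_ZBUFFER | RADEON_SURF_NO_HTILE; */
   return (surf->flags & RADEON_SURF_Z_OR_SBUFFER) != 0;
}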
 
 struct legacy_surf_level {
-    uint64_t                    offset;
-    uint32_t                    slice_size_dw; /* in dwords; max = 4GB / 4. */
-    uint32_t                    dcc_offset; /* relative offset within DCC mip tree */
-    uint32_t                    dcc_fast_clear_size;
-    uint32_t                    dcc_slice_fast_clear_size;
-    unsigned                    nblk_x:15;
-    unsigned                    nblk_y:15;
-    enum radeon_surf_mode       mode:2;
+   uint64_t offset;
+   uint32_t slice_size_dw; /* in dwords; max = 4GB / 4. */
+   uint32_t dcc_offset;    /* relative offset within DCC mip tree */
+   uint32_t dcc_fast_clear_size;
+   uint32_t dcc_slice_fast_clear_size;
+   unsigned nblk_x : 15;
+   unsigned nblk_y : 15;
+   enum radeon_surf_mode mode : 2;
 };
 
 struct legacy_surf_fmask {
-    unsigned slice_tile_max; /* max 4M */
-    uint8_t tiling_index;    /* max 31 */
-    uint8_t bankh;           /* max 8 */
-    uint16_t pitch_in_pixels;
-    uint64_t slice_size;
+   unsigned slice_tile_max; /* max 4M */
+   uint8_t tiling_index;    /* max 31 */
+   uint8_t bankh;           /* max 8 */
+   uint16_t pitch_in_pixels;
+   uint64_t slice_size;
 };
 
 struct legacy_surf_layout {
-    unsigned                    bankw:4;  /* max 8 */
-    unsigned                    bankh:4;  /* max 8 */
-    unsigned                    mtilea:4; /* max 8 */
-    unsigned                    tile_split:13;         /* max 4K */
-    unsigned                    stencil_tile_split:13; /* max 4K */
-    unsigned                    pipe_config:5;      /* max 17 */
-    unsigned                    num_banks:5;        /* max 16 */
-    unsigned                    macro_tile_index:4; /* max 15 */
-
-    /* Whether the depth miptree or stencil miptree as used by the DB are
-     * adjusted from their TC compatible form to ensure depth/stencil
-     * compatibility. If either is true, the corresponding plane cannot be
-     * sampled from.
-     */
-    unsigned                    depth_adjusted:1;
-    unsigned                    stencil_adjusted:1;
-
-    struct legacy_surf_level    level[RADEON_SURF_MAX_LEVELS];
-    struct legacy_surf_level    stencil_level[RADEON_SURF_MAX_LEVELS];
-    uint8_t                     tiling_index[RADEON_SURF_MAX_LEVELS];
-    uint8_t                     stencil_tiling_index[RADEON_SURF_MAX_LEVELS];
-    struct legacy_surf_fmask    fmask;
-    unsigned                    cmask_slice_tile_max;
+   unsigned bankw : 4;               /* max 8 */
+   unsigned bankh : 4;               /* max 8 */
+   unsigned mtilea : 4;              /* max 8 */
+   unsigned tile_split : 13;         /* max 4K */
+   unsigned stencil_tile_split : 13; /* max 4K */
+   unsigned pipe_config : 5;         /* max 17 */
+   unsigned num_banks : 5;           /* max 16 */
+   unsigned macro_tile_index : 4;    /* max 15 */
+
+   /* Whether the depth miptree or stencil miptree as used by the DB are
+    * adjusted from their TC compatible form to ensure depth/stencil
+    * compatibility. If either is true, the corresponding plane cannot be
+    * sampled from.
+    */
+   unsigned depth_adjusted : 1;
+   unsigned stencil_adjusted : 1;
+
+   struct legacy_surf_level level[RADEON_SURF_MAX_LEVELS];
+   struct legacy_surf_level stencil_level[RADEON_SURF_MAX_LEVELS];
+   uint8_t tiling_index[RADEON_SURF_MAX_LEVELS];
+   uint8_t stencil_tiling_index[RADEON_SURF_MAX_LEVELS];
+   struct legacy_surf_fmask fmask;
+   unsigned cmask_slice_tile_max;
 };
 
 /* Same as addrlib - AddrResourceType. */
-enum gfx9_resource_type {
-    RADEON_RESOURCE_1D = 0,
-    RADEON_RESOURCE_2D,
-    RADEON_RESOURCE_3D,
+enum gfx9_resource_type
+{
+   RADEON_RESOURCE_1D = 0,
+   RADEON_RESOURCE_2D,
+   RADEON_RESOURCE_3D,
 };
 
 struct gfx9_surf_flags {
-    uint16_t                    swizzle_mode; /* tile mode */
-    uint16_t                    epitch; /* (pitch - 1) or (height - 1) */
+   uint16_t swizzle_mode; /* tile mode */
+   uint16_t epitch;       /* (pitch - 1) or (height - 1) */
 };
 
 struct gfx9_surf_meta_flags {
-    unsigned                    rb_aligned:1;   /* optimal for RBs */
-    unsigned                    pipe_aligned:1; /* optimal for TC */
-    unsigned                    independent_64B_blocks:1;
-    unsigned                    independent_128B_blocks:1;
-    unsigned                    max_compressed_block_size:2;
+   unsigned rb_aligned : 1;   /* optimal for RBs */
+   unsigned pipe_aligned : 1; /* optimal for TC */
+   unsigned independent_64B_blocks : 1;
+   unsigned independent_128B_blocks : 1;
+   unsigned max_compressed_block_size : 2;
 };
 
 struct gfx9_surf_layout {
-    struct gfx9_surf_flags      surf;    /* color or depth surface */
-    struct gfx9_surf_flags      fmask;   /* not added to surf_size */
-    struct gfx9_surf_flags      stencil; /* added to surf_size, use stencil_offset */
-
-    struct gfx9_surf_meta_flags dcc;   /* metadata of color */
-
-    enum gfx9_resource_type     resource_type; /* 1D, 2D or 3D */
-    uint16_t                    surf_pitch; /* in blocks */
-    uint16_t                    surf_height;
-
-    uint64_t                    surf_offset; /* 0 unless imported with an offset */
-    /* The size of the 2D plane containing all mipmap levels. */
-    uint64_t                    surf_slice_size;
-    /* Mipmap level offset within the slice in bytes. Only valid for LINEAR. */
-    uint32_t                    offset[RADEON_SURF_MAX_LEVELS];
-    /* Mipmap level pitch in elements. Only valid for LINEAR. */
-    uint16_t                    pitch[RADEON_SURF_MAX_LEVELS];
-
-    uint64_t                    stencil_offset; /* separate stencil */
-
-    uint8_t                     dcc_block_width;
-    uint8_t                     dcc_block_height;
-    uint8_t                     dcc_block_depth;
-
-    /* Displayable DCC. This is always rb_aligned=0 and pipe_aligned=0.
-     * The 3D engine doesn't support that layout except for chips with 1 RB.
-     * All other chips must set rb_aligned=1.
-     * A compute shader needs to convert from aligned DCC to unaligned.
-     */
-    uint32_t                    display_dcc_size;
-    uint32_t                    display_dcc_alignment;
-    uint16_t                    display_dcc_pitch_max;  /* (mip chain pitch - 1) */
-    bool                        dcc_retile_use_uint16; /* if all values fit into uint16_t */
-    uint32_t                    dcc_retile_num_elements;
-    void                        *dcc_retile_map;
+   struct gfx9_surf_flags surf;    /* color or depth surface */
+   struct gfx9_surf_flags fmask;   /* not added to surf_size */
+   struct gfx9_surf_flags stencil; /* added to surf_size, use stencil_offset */
+
+   struct gfx9_surf_meta_flags dcc; /* metadata of color */
+
+   enum gfx9_resource_type resource_type; /* 1D, 2D or 3D */
+   uint16_t surf_pitch;                   /* in blocks */
+   uint16_t surf_height;
+
+   uint64_t surf_offset; /* 0 unless imported with an offset */
+   /* The size of the 2D plane containing all mipmap levels. */
+   uint64_t surf_slice_size;
+   /* Mipmap level offset within the slice in bytes. Only valid for LINEAR. */
+   uint32_t offset[RADEON_SURF_MAX_LEVELS];
+   /* Mipmap level pitch in elements. Only valid for LINEAR. */
+   uint16_t pitch[RADEON_SURF_MAX_LEVELS];
+
+   uint64_t stencil_offset; /* separate stencil */
+
+   uint8_t dcc_block_width;
+   uint8_t dcc_block_height;
+   uint8_t dcc_block_depth;
+
+   /* Displayable DCC. This is always rb_aligned=0 and pipe_aligned=0.
+    * The 3D engine doesn't support that layout except for chips with 1 RB.
+    * All other chips must set rb_aligned=1.
+    * A compute shader needs to convert from aligned DCC to unaligned.
+    */
+   uint32_t display_dcc_size;
+   uint32_t display_dcc_alignment;
+   uint16_t display_dcc_pitch_max; /* (mip chain pitch - 1) */
+   bool dcc_retile_use_uint16;     /* if all values fit into uint16_t */
+   uint32_t dcc_retile_num_elements;
+   void *dcc_retile_map;
 };
 
 struct radeon_surf {
-    /* Format properties. */
-    unsigned                    blk_w:4;
-    unsigned                    blk_h:4;
-    unsigned                    bpe:5;
-    /* Number of mipmap levels where DCC is enabled starting from level 0.
-     * Non-zero levels may be disabled due to alignment constraints, but not
-     * the first level.
-     */
-    unsigned                    num_dcc_levels:4;
-    unsigned                    is_linear:1;
-    unsigned                    has_stencil:1;
-    /* This might be true even if micro_tile_mode isn't displayable or rotated. */
-    unsigned                    is_displayable:1;
-    /* Displayable, thin, depth, rotated. AKA D,S,Z,R swizzle modes. */
-    unsigned                    micro_tile_mode:3;
-    uint32_t                    flags;
-
-    /* These are return values. Some of them can be set by the caller, but
-     * they will be treated as hints (e.g. bankw, bankh) and might be
-     * changed by the calculator.
-     */
-
-    /* Tile swizzle can be OR'd with low bits of the BASE_256B address.
-     * The value is the same for all mipmap levels. Supported tile modes:
-     * - GFX6: Only macro tiling.
-     * - GFX9: Only *_X and *_T swizzle modes. Level 0 must not be in the mip
-     *   tail.
-     *
-     * Only these surfaces are allowed to set it:
-     * - color (if it doesn't have to be displayable)
-     * - DCC (same tile swizzle as color)
-     * - FMASK
-     * - CMASK if it's TC-compatible or if the gen is GFX9
-     * - depth/stencil if HTILE is not TC-compatible and if the gen is not GFX9
-     */
-    uint8_t                     tile_swizzle;
-    uint8_t                     fmask_tile_swizzle;
-
-    uint64_t                    surf_size;
-    uint64_t                    fmask_size;
-    uint32_t                    surf_alignment;
-    uint32_t                    fmask_alignment;
-
-    /* DCC and HTILE are very small. */
-    uint32_t                    dcc_size;
-    uint32_t                    dcc_slice_size;
-    uint32_t                    dcc_alignment;
-
-    uint32_t                    htile_size;
-    uint32_t                    htile_slice_size;
-    uint32_t                    htile_alignment;
-
-    uint32_t                    cmask_size;
-    uint32_t                    cmask_slice_size;
-    uint32_t                    cmask_alignment;
-
-    /* All buffers combined. */
-    uint64_t                    htile_offset;
-    uint64_t                    fmask_offset;
-    uint64_t                    cmask_offset;
-    uint64_t                    dcc_offset;
-    uint64_t                    display_dcc_offset;
-    uint64_t                    dcc_retile_map_offset;
-    uint64_t                    total_size;
-    uint32_t                    alignment;
-
-    union {
-        /* Return values for GFX8 and older.
-         *
-         * Some of them can be set by the caller if certain parameters are
-         * desirable. The allocator will try to obey them.
-         */
-        struct legacy_surf_layout legacy;
-
-        /* GFX9+ return values. */
-        struct gfx9_surf_layout gfx9;
-    } u;
+   /* Format properties. */
+   unsigned blk_w : 4;
+   unsigned blk_h : 4;
+   unsigned bpe : 5;
+   /* Number of mipmap levels where DCC is enabled starting from level 0.
+    * Non-zero levels may be disabled due to alignment constraints, but not
+    * the first level.
+    */
+   unsigned num_dcc_levels : 4;
+   unsigned is_linear : 1;
+   unsigned has_stencil : 1;
+   /* This might be true even if micro_tile_mode isn't displayable or rotated. */
+   unsigned is_displayable : 1;
+   /* Displayable, thin, depth, rotated. AKA D,S,Z,R swizzle modes. */
+   unsigned micro_tile_mode : 3;
+   uint32_t flags;
+
+   /* These are return values. Some of them can be set by the caller, but
+    * they will be treated as hints (e.g. bankw, bankh) and might be
+    * changed by the calculator.
+    */
+
+   /* Tile swizzle can be OR'd with low bits of the BASE_256B address.
+    * The value is the same for all mipmap levels. Supported tile modes:
+    * - GFX6: Only macro tiling.
+    * - GFX9: Only *_X and *_T swizzle modes. Level 0 must not be in the mip
+    *   tail.
+    *
+    * Only these surfaces are allowed to set it:
+    * - color (if it doesn't have to be displayable)
+    * - DCC (same tile swizzle as color)
+    * - FMASK
+    * - CMASK if it's TC-compatible or if the gen is GFX9
+    * - depth/stencil if HTILE is not TC-compatible and if the gen is not GFX9
+    */
+   uint8_t tile_swizzle;
+   uint8_t fmask_tile_swizzle;
+
+   uint64_t surf_size;
+   uint64_t fmask_size;
+   uint32_t surf_alignment;
+   uint32_t fmask_alignment;
+
+   /* DCC and HTILE are very small. */
+   uint32_t dcc_size;
+   uint32_t dcc_slice_size;
+   uint32_t dcc_alignment;
+
+   uint32_t htile_size;
+   uint32_t htile_slice_size;
+   uint32_t htile_alignment;
+
+   uint32_t cmask_size;
+   uint32_t cmask_slice_size;
+   uint32_t cmask_alignment;
+
+   /* All buffers combined. */
+   uint64_t htile_offset;
+   uint64_t fmask_offset;
+   uint64_t cmask_offset;
+   uint64_t dcc_offset;
+   uint64_t display_dcc_offset;
+   uint64_t dcc_retile_map_offset;
+   uint64_t total_size;
+   uint32_t alignment;
+
+   union {
+      /* Return values for GFX8 and older.
+       *
+       * Some of them can be set by the caller if certain parameters are
+       * desirable. The allocator will try to obey them.
+       */
+      struct legacy_surf_layout legacy;
+
+      /* GFX9+ return values. */
+      struct gfx9_surf_layout gfx9;
+   } u;
 };
 
 struct ac_surf_info {
-       uint32_t width;
-       uint32_t height;
-       uint32_t depth;
-       uint8_t samples; /* For Z/S: samples; For color: FMASK coverage samples */
-       uint8_t storage_samples; /* For color: allocated samples */
-       uint8_t levels;
-       uint8_t num_channels; /* heuristic for displayability */
-       uint16_t array_size;
-       uint32_t *surf_index; /* Set a monotonic counter for tile swizzling. */
-       uint32_t *fmask_surf_index;
+   uint32_t width;
+   uint32_t height;
+   uint32_t depth;
+   uint8_t samples;         /* For Z/S: samples; For color: FMASK coverage samples */
+   uint8_t storage_samples; /* For color: allocated samples */
+   uint8_t levels;
+   uint8_t num_channels; /* heuristic for displayability */
+   uint16_t array_size;
+   uint32_t *surf_index; /* Set a monotonic counter for tile swizzling. */
+   uint32_t *fmask_surf_index;
 };
 
 struct ac_surf_config {
-       struct ac_surf_info info;
-       unsigned is_1d : 1;
-       unsigned is_3d : 1;
-       unsigned is_cube : 1;
+   struct ac_surf_info info;
+   unsigned is_1d : 1;
+   unsigned is_3d : 1;
+   unsigned is_cube : 1;
 };
 
 struct ac_addrlib *ac_addrlib_create(const struct radeon_info *info,
-                                    const struct amdgpu_gpu_info *amdinfo,
-                                    uint64_t *max_alignment);
+                                     const struct amdgpu_gpu_info *amdinfo,
+                                     uint64_t *max_alignment);
 void ac_addrlib_destroy(struct ac_addrlib *addrlib);
 
 int ac_compute_surface(struct ac_addrlib *addrlib, const struct radeon_info *info,
-                      const struct ac_surf_config * config,
-                      enum radeon_surf_mode mode,
-                      struct radeon_surf *surf);
+                       const struct ac_surf_config *config, enum radeon_surf_mode mode,
+                       struct radeon_surf *surf);
 void ac_surface_zero_dcc_fields(struct radeon_surf *surf);
 
-void ac_surface_set_bo_metadata(const struct radeon_info *info,
-                                struct radeon_surf *surf, uint64_t tiling_flags,
-                                enum radeon_surf_mode *mode);
-void ac_surface_get_bo_metadata(const struct radeon_info *info,
-                                struct radeon_surf *surf, uint64_t *tiling_flags);
-
-bool ac_surface_set_umd_metadata(const struct radeon_info *info,
-                                 struct radeon_surf *surf,
-                                 unsigned num_storage_samples,
-                                 unsigned num_mipmap_levels,
-                                 unsigned size_metadata,
-                                 uint32_t metadata[64]);
-void ac_surface_get_umd_metadata(const struct radeon_info *info,
-                                 struct radeon_surf *surf,
-                                 unsigned num_mipmap_levels,
-                                 uint32_t desc[8],
+void ac_surface_set_bo_metadata(const struct radeon_info *info, struct radeon_surf *surf,
+                                uint64_t tiling_flags, enum radeon_surf_mode *mode);
+void ac_surface_get_bo_metadata(const struct radeon_info *info, struct radeon_surf *surf,
+                                uint64_t *tiling_flags);
+
+bool ac_surface_set_umd_metadata(const struct radeon_info *info, struct radeon_surf *surf,
+                                 unsigned num_storage_samples, unsigned num_mipmap_levels,
+                                 unsigned size_metadata, uint32_t metadata[64]);
+void ac_surface_get_umd_metadata(const struct radeon_info *info, struct radeon_surf *surf,
+                                 unsigned num_mipmap_levels, uint32_t desc[8],
                                  unsigned *size_metadata, uint32_t metadata[64]);
 
-void ac_surface_override_offset_stride(const struct radeon_info *info,
-                                       struct radeon_surf *surf,
-                                       unsigned num_mipmap_levels,
-                                       uint64_t offset, unsigned pitch);
+void ac_surface_override_offset_stride(const struct radeon_info *info, struct radeon_surf *surf,
+                                       unsigned num_mipmap_levels, uint64_t offset, unsigned pitch);
 
 #ifdef __cplusplus
 }
index 485ae276306ad928c00c3254411185e7540338e5..475cb835b3c113c697e4c6a2108bffd2ba34d21f 100644 (file)
 #ifndef AMD_FAMILY_H
 #define AMD_FAMILY_H
 
-enum radeon_family {
-    CHIP_UNKNOWN = 0,
-    CHIP_R300,     /* R3xx-based cores. (GFX2) */
-    CHIP_R350,
-    CHIP_RV350,
-    CHIP_RV370,
-    CHIP_RV380,
-    CHIP_RS400,
-    CHIP_RC410,
-    CHIP_RS480,
-    CHIP_R420,     /* R4xx-based cores. (GFX2) */
-    CHIP_R423,
-    CHIP_R430,
-    CHIP_R480,
-    CHIP_R481,
-    CHIP_RV410,
-    CHIP_RS600,
-    CHIP_RS690,
-    CHIP_RS740,
-    CHIP_RV515,    /* R5xx-based cores. (GFX2) */
-    CHIP_R520,
-    CHIP_RV530,
-    CHIP_R580,
-    CHIP_RV560,
-    CHIP_RV570,
-    CHIP_R600,     /* GFX3 (R6xx) */
-    CHIP_RV610,
-    CHIP_RV630,
-    CHIP_RV670,
-    CHIP_RV620,
-    CHIP_RV635,
-    CHIP_RS780,
-    CHIP_RS880,
-    CHIP_RV770,    /* GFX3 (R7xx) */
-    CHIP_RV730,
-    CHIP_RV710,
-    CHIP_RV740,
-    CHIP_CEDAR,    /* GFX4 (Evergreen) */
-    CHIP_REDWOOD,
-    CHIP_JUNIPER,
-    CHIP_CYPRESS,
-    CHIP_HEMLOCK,
-    CHIP_PALM,
-    CHIP_SUMO,
-    CHIP_SUMO2,
-    CHIP_BARTS,
-    CHIP_TURKS,
-    CHIP_CAICOS,
-    CHIP_CAYMAN,   /* GFX5 (Northern Islands) */
-    CHIP_ARUBA,
-    CHIP_TAHITI,   /* GFX6 (Southern Islands) */
-    CHIP_PITCAIRN,
-    CHIP_VERDE,
-    CHIP_OLAND,
-    CHIP_HAINAN,
-    CHIP_BONAIRE,  /* GFX7 (Sea Islands) */
-    CHIP_KAVERI,
-    CHIP_KABINI,
-    CHIP_HAWAII,
-    CHIP_TONGA,    /* GFX8 (Volcanic Islands & Polaris) */
-    CHIP_ICELAND,
-    CHIP_CARRIZO,
-    CHIP_FIJI,
-    CHIP_STONEY,
-    CHIP_POLARIS10,
-    CHIP_POLARIS11,
-    CHIP_POLARIS12,
-    CHIP_VEGAM,
-    CHIP_VEGA10,   /* GFX9 (Vega) */
-    CHIP_VEGA12,
-    CHIP_VEGA20,
-    CHIP_RAVEN,
-    CHIP_RAVEN2,
-    CHIP_RENOIR,
-    CHIP_ARCTURUS,
-    CHIP_NAVI10,
-    CHIP_NAVI12,
-    CHIP_NAVI14,
-    CHIP_SIENNA_CICHLID,
-    CHIP_NAVY_FLOUNDER,
-    CHIP_LAST,
+enum radeon_family
+{
+   CHIP_UNKNOWN = 0,
+   CHIP_R300, /* R3xx-based cores. (GFX2) */
+   CHIP_R350,
+   CHIP_RV350,
+   CHIP_RV370,
+   CHIP_RV380,
+   CHIP_RS400,
+   CHIP_RC410,
+   CHIP_RS480,
+   CHIP_R420, /* R4xx-based cores. (GFX2) */
+   CHIP_R423,
+   CHIP_R430,
+   CHIP_R480,
+   CHIP_R481,
+   CHIP_RV410,
+   CHIP_RS600,
+   CHIP_RS690,
+   CHIP_RS740,
+   CHIP_RV515, /* R5xx-based cores. (GFX2) */
+   CHIP_R520,
+   CHIP_RV530,
+   CHIP_R580,
+   CHIP_RV560,
+   CHIP_RV570,
+   CHIP_R600, /* GFX3 (R6xx) */
+   CHIP_RV610,
+   CHIP_RV630,
+   CHIP_RV670,
+   CHIP_RV620,
+   CHIP_RV635,
+   CHIP_RS780,
+   CHIP_RS880,
+   CHIP_RV770, /* GFX3 (R7xx) */
+   CHIP_RV730,
+   CHIP_RV710,
+   CHIP_RV740,
+   CHIP_CEDAR, /* GFX4 (Evergreen) */
+   CHIP_REDWOOD,
+   CHIP_JUNIPER,
+   CHIP_CYPRESS,
+   CHIP_HEMLOCK,
+   CHIP_PALM,
+   CHIP_SUMO,
+   CHIP_SUMO2,
+   CHIP_BARTS,
+   CHIP_TURKS,
+   CHIP_CAICOS,
+   CHIP_CAYMAN, /* GFX5 (Northern Islands) */
+   CHIP_ARUBA,
+   CHIP_TAHITI, /* GFX6 (Southern Islands) */
+   CHIP_PITCAIRN,
+   CHIP_VERDE,
+   CHIP_OLAND,
+   CHIP_HAINAN,
+   CHIP_BONAIRE, /* GFX7 (Sea Islands) */
+   CHIP_KAVERI,
+   CHIP_KABINI,
+   CHIP_HAWAII,
+   CHIP_TONGA, /* GFX8 (Volcanic Islands & Polaris) */
+   CHIP_ICELAND,
+   CHIP_CARRIZO,
+   CHIP_FIJI,
+   CHIP_STONEY,
+   CHIP_POLARIS10,
+   CHIP_POLARIS11,
+   CHIP_POLARIS12,
+   CHIP_VEGAM,
+   CHIP_VEGA10, /* GFX9 (Vega) */
+   CHIP_VEGA12,
+   CHIP_VEGA20,
+   CHIP_RAVEN,
+   CHIP_RAVEN2,
+   CHIP_RENOIR,
+   CHIP_ARCTURUS,
+   CHIP_NAVI10,
+   CHIP_NAVI12,
+   CHIP_NAVI14,
+   CHIP_SIENNA_CICHLID,
+   CHIP_NAVY_FLOUNDER,
+   CHIP_LAST,
 };
 
-enum chip_class {
-    CLASS_UNKNOWN = 0,
-    R300,
-    R400,
-    R500,
-    R600,
-    R700,
-    EVERGREEN,
-    CAYMAN,
-    GFX6,
-    GFX7,
-    GFX8,
-    GFX9,
-    GFX10,
-    GFX10_3,
+enum chip_class
+{
+   CLASS_UNKNOWN = 0,
+   R300,
+   R400,
+   R500,
+   R600,
+   R700,
+   EVERGREEN,
+   CAYMAN,
+   GFX6,
+   GFX7,
+   GFX8,
+   GFX9,
+   GFX10,
+   GFX10_3,
 };
 
-enum ring_type {
-    RING_GFX = 0,
-    RING_COMPUTE,
-    RING_DMA,
-    RING_UVD,
-    RING_VCE,
-    RING_UVD_ENC,
-    RING_VCN_DEC,
-    RING_VCN_ENC,
-    RING_VCN_JPEG,
-    NUM_RING_TYPES,
+enum ring_type
+{
+   RING_GFX = 0,
+   RING_COMPUTE,
+   RING_DMA,
+   RING_UVD,
+   RING_VCE,
+   RING_UVD_ENC,
+   RING_VCN_DEC,
+   RING_VCN_ENC,
+   RING_VCN_JPEG,
+   NUM_RING_TYPES,
 };
 
 #endif
index f8e95085181c3855bd390b0d1fdd87ccb82c3213..7a5c2ea3b9c942ff6c59d596443c474895893c70 100644 (file)
 //---------------------------------------------------------------------------//
 
 // Sets val bits for specified mask in specified dst packed instance.
-#define AMD_HSA_BITS_SET(dst, mask, val)                                       \
-  dst &= (~(1 << mask ## _SHIFT) & ~mask);                                     \
-  dst |= (((val) << mask ## _SHIFT) & mask)
+#define AMD_HSA_BITS_SET(dst, mask, val)                                                           \
+   dst &= (~(1 << mask##_SHIFT) & ~mask);                                                          \
+   dst |= (((val) << mask##_SHIFT) & mask)
 
 // Gets bits for specified mask from specified src packed instance.
-#define AMD_HSA_BITS_GET(src, mask)                                            \
-  ((src & mask) >> mask ## _SHIFT)
+#define AMD_HSA_BITS_GET(src, mask) ((src & mask) >> mask##_SHIFT)
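
Both macros rely on the convention that every mask FOO comes with a matching FOO_SHIFT. A usage sketch with an invented 2-bit field (FOO/FOO_SHIFT are not part of this header):

#define FOO_SHIFT 4
#define FOO       (0x3 << FOO_SHIFT)

static unsigned pack_and_read_foo(void)
{
   uint32_t packed = 0;
   AMD_HSA_BITS_SET(packed, FOO, 2);      /* bits 5:4 now hold the value 2 */
   return AMD_HSA_BITS_GET(packed, FOO);  /* returns 2 */
}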
 
 /* Every amd_*_code_t has the following properties, which are composed of
  * a number of bit fields. Every bit field has a mask (AMD_CODE_PROPERTY_*),
  * implementation defined in the C standard and so cannot be used to
  * specify an ABI)
  */
-enum amd_code_property_mask_t {
-
-  /* Enable the setup of the SGPR user data registers
-   * (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t
-   * for initial register state.
-   *
-   * The total number of SGPRuser data registers requested must not
-   * exceed 16. Any requests beyond 16 will be ignored.
-   *
-   * Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of
-   * SGPR user data registers enabled up to 16).
-   */
-
-  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0,
-  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1,
-  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT,
-
-  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1,
-  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1,
-  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT,
-
-  AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2,
-  AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1,
-  AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT,
-
-  AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3,
-  AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1,
-  AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT,
-
-  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4,
-  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1,
-  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT,
-
-  AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5,
-  AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1,
-  AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT,
-
-  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6,
-  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1,
-  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT,
-
-  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7,
-  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1,
-  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT,
-
-  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8,
-  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1,
-  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT,
-
-  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9,
-  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1,
-  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT,
-
-  AMD_CODE_PROPERTY_RESERVED1_SHIFT = 10,
-  AMD_CODE_PROPERTY_RESERVED1_WIDTH = 6,
-  AMD_CODE_PROPERTY_RESERVED1 = ((1 << AMD_CODE_PROPERTY_RESERVED1_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED1_SHIFT,
-
-  /* Control wave ID base counter for GDS ordered-append. Used to set
-   * COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if
-   * ORDERED_APPEND_MODE also needs to be settable)
-   */
-  AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 16,
-  AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1,
-  AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT,
-
-  /* The interleave (swizzle) element size in bytes required by the
-   * code for private memory. This must be 2, 4, 8 or 16. This value
-   * is provided to the finalizer when it is invoked and is recorded
-   * here. The hardware will interleave the memory requests of each
-   * lane of a wavefront by this element size to ensure each
-   * work-item gets a distinct memory memory location. Therefore, the
-   * finalizer ensures that all load and store operations done to
-   * private memory do not exceed this size. For example, if the
-   * element size is 4 (32-bits or dword) and a 64-bit value must be
-   * loaded, the finalizer will generate two 32-bit loads. This
-   * ensures that the interleaving will get the work-item
-   * specific dword for both halves of the 64-bit value. If it just
-   * did a 64-bit load then it would get one dword which belonged to
-   * its own work-item, but the second dword would belong to the
-   * adjacent lane work-item since the interleaving is in dwords.
-   *
-   * The value used must match the value that the runtime configures
-   * the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This
-   * is generally DWORD.
-   *
-   * USE VALUES FROM THE AMD_ELEMENT_BYTE_SIZE_T ENUM.
-   */
-  AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 17,
-  AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2,
-  AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT,
-
-  /* Are global memory addresses 64 bits. Must match
-   * amd_kernel_code_t.hsail_machine_model ==
-   * HSA_MACHINE_LARGE. Must also match
-   * SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)),
-   * SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+).
-   */
-  AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 19,
-  AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1,
-  AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT,
-
-  /* Indicate if the generated ISA is using a dynamically sized call
-   * stack. This can happen if calls are implemented using a call
-   * stack and recursion, alloca or calls to indirect functions are
-   * present. In these cases the Finalizer cannot compute the total
-   * private segment size at compile time. In this case the
-   * workitem_private_segment_byte_size only specifies the statically
-   * know private segment size, and additional space must be added
-   * for the call stack.
-   */
-  AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 20,
-  AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1,
-  AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT,
-
-  /* Indicate if code generated has support for debugging. */
-  AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 21,
-  AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1,
-  AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT,
-
-  AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT = 22,
-  AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH = 1,
-  AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT,
-
-  AMD_CODE_PROPERTY_RESERVED2_SHIFT = 23,
-  AMD_CODE_PROPERTY_RESERVED2_WIDTH = 9,
-  AMD_CODE_PROPERTY_RESERVED2 = ((1 << AMD_CODE_PROPERTY_RESERVED2_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED2_SHIFT
+enum amd_code_property_mask_t
+{
+
+   /* Enable the setup of the SGPR user data registers
+    * (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t
+    * for initial register state.
+    *
+    * The total number of SGPR user data registers requested must not
+    * exceed 16. Any requests beyond 16 will be ignored.
+    *
+    * Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of
+    * SGPR user data registers enabled up to 16).
+    */
+
+   AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0,
+   AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1,
+   AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER =
+      ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1)
+      << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT,
+
+   AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1,
+   AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1,
+   AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR =
+      ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1)
+      << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT,
+
+   AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2,
+   AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1,
+   AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR =
+      ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1)
+      << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT,
+
+   AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3,
+   AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1,
+   AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR =
+      ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1)
+      << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT,
+
+   AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4,
+   AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1,
+   AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID =
+      ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1)
+      << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT,
+
+   AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5,
+   AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1,
+   AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT =
+      ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1)
+      << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT,
+
+   AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6,
+   AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1,
+   AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE =
+      ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1)
+      << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT,
+
+   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7,
+   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1,
+   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X =
+      ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1)
+      << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT,
+
+   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8,
+   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1,
+   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y =
+      ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1)
+      << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT,
+
+   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9,
+   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1,
+   AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z =
+      ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1)
+      << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT,
+
+   AMD_CODE_PROPERTY_RESERVED1_SHIFT = 10,
+   AMD_CODE_PROPERTY_RESERVED1_WIDTH = 6,
+   AMD_CODE_PROPERTY_RESERVED1 = ((1 << AMD_CODE_PROPERTY_RESERVED1_WIDTH) - 1)
+                                 << AMD_CODE_PROPERTY_RESERVED1_SHIFT,
+
+   /* Control wave ID base counter for GDS ordered-append. Used to set
+    * COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if
+    * ORDERED_APPEND_MODE also needs to be settable)
+    */
+   AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 16,
+   AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1,
+   AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS =
+      ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1)
+      << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT,
+
+   /* The interleave (swizzle) element size in bytes required by the
+    * code for private memory. This must be 2, 4, 8 or 16. This value
+    * is provided to the finalizer when it is invoked and is recorded
+    * here. The hardware will interleave the memory requests of each
+    * lane of a wavefront by this element size to ensure each
+    * work-item gets a distinct memory location. Therefore, the
+    * finalizer ensures that all load and store operations done to
+    * private memory do not exceed this size. For example, if the
+    * element size is 4 (32-bits or dword) and a 64-bit value must be
+    * loaded, the finalizer will generate two 32-bit loads. This
+    * ensures that the interleaving will get the work-item
+    * specific dword for both halves of the 64-bit value. If it just
+    * did a 64-bit load then it would get one dword which belonged to
+    * its own work-item, but the second dword would belong to the
+    * adjacent lane work-item since the interleaving is in dwords.
+    *
+    * The value used must match the value that the runtime configures
+    * the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This
+    * is generally DWORD.
+    *
+    * USE VALUES FROM THE AMD_ELEMENT_BYTE_SIZE_T ENUM.
+    */
+   AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 17,
+   AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2,
+   AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE =
+      ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1)
+      << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT,
+
+   /* Are global memory addresses 64 bits. Must match
+    * amd_kernel_code_t.hsail_machine_model ==
+    * HSA_MACHINE_LARGE. Must also match
+    * SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)),
+    * SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+).
+    */
+   AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 19,
+   AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1,
+   AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1)
+                                << AMD_CODE_PROPERTY_IS_PTR64_SHIFT,
+
+   /* Indicate if the generated ISA is using a dynamically sized call
+    * stack. This can happen if calls are implemented using a call
+    * stack and recursion, alloca or calls to indirect functions are
+    * present. In these cases the Finalizer cannot compute the total
+    * private segment size at compile time. In this case the
+    * workitem_private_segment_byte_size only specifies the statically
+    * known private segment size, and additional space must be added
+    * for the call stack.
+    */
+   AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 20,
+   AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1,
+   AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK =
+      ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1)
+      << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT,
+
+   /* Indicate if code generated has support for debugging. */
+   AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 21,
+   AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1,
+   AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1)
+                                          << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT,
+
+   AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT = 22,
+   AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH = 1,
+   AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH) - 1)
+                                          << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT,
+
+   AMD_CODE_PROPERTY_RESERVED2_SHIFT = 23,
+   AMD_CODE_PROPERTY_RESERVED2_WIDTH = 9,
+   AMD_CODE_PROPERTY_RESERVED2 = ((1 << AMD_CODE_PROPERTY_RESERVED2_WIDTH) - 1)
+                                 << AMD_CODE_PROPERTY_RESERVED2_SHIFT
 };
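
Each property in the enum above is described by a _SHIFT/_WIDTH pair plus a mask built as ((1 << WIDTH) - 1) << SHIFT, so reading a field out of code_properties is a single mask-and-shift. A small sketch (the helper name is invented for illustration):

static unsigned amd_code_get_private_element_size(uint32_t code_properties)
{
   return (code_properties & AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE) >>
          AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT;
}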
 
 /* AMD Kernel Code Object (amd_kernel_code_t). GPU CP uses the AMD Kernel
@@ -381,154 +412,154 @@ enum amd_code_property_mask_t {
  */
 
 typedef struct amd_kernel_code_s {
-  uint32_t amd_kernel_code_version_major;
-  uint32_t amd_kernel_code_version_minor;
-  uint16_t amd_machine_kind;
-  uint16_t amd_machine_version_major;
-  uint16_t amd_machine_version_minor;
-  uint16_t amd_machine_version_stepping;
-
-  /* Byte offset (possibly negative) from start of amd_kernel_code_t
-   * object to kernel's entry point instruction. The actual code for
-   * the kernel is required to be 256 byte aligned to match hardware
-   * requirements (SQ cache line is 16). The code must be position
-   * independent code (PIC) for AMD devices to give runtime the
-   * option of copying code to discrete GPU memory or APU L2
-   * cache. The Finalizer should endeavour to allocate all kernel
-   * machine code in contiguous memory pages so that a device
-   * pre-fetcher will tend to only pre-fetch Kernel Code objects,
-   * improving cache performance.
-   */
-  int64_t kernel_code_entry_byte_offset;
-
-  /* Range of bytes to consider prefetching expressed as an offset
-   * and size. The offset is from the start (possibly negative) of
-   * amd_kernel_code_t object. Set both to 0 if no prefetch
-   * information is available.
-   */
-  int64_t kernel_code_prefetch_byte_offset;
-  uint64_t kernel_code_prefetch_byte_size;
-
-  /* Number of bytes of scratch backing memory required for full
-   * occupancy of target chip. This takes into account the number of
-   * bytes of scratch per work-item, the wavefront size, the maximum
-   * number of wavefronts per CU, and the number of CUs. This is an
-   * upper limit on scratch. If the grid being dispatched is small it
-   * may only need less than this. If the kernel uses no scratch, or
-   * the Finalizer has not computed this value, it must be 0.
-   */
-  uint64_t max_scratch_backing_memory_byte_size;
-
-  /* Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
-   * COMPUTE_PGM_RSRC2 registers.
-   */
-  uint64_t compute_pgm_resource_registers;
-
-  /* Code properties. See amd_code_property_mask_t for a full list of
-   * properties.
-   */
-  uint32_t code_properties;
-
-  /* The amount of memory required for the combined private, spill
-   * and arg segments for a work-item in bytes. If
-   * is_dynamic_callstack is 1 then additional space must be added to
-   * this value for the call stack.
-   */
-  uint32_t workitem_private_segment_byte_size;
-
-  /* The amount of group segment memory required by a work-group in
-   * bytes. This does not include any dynamically allocated group
-   * segment memory that may be added when the kernel is
-   * dispatched.
-   */
-  uint32_t workgroup_group_segment_byte_size;
-
-  /* Number of byte of GDS required by kernel dispatch. Must be 0 if
-   * not using GDS.
-   */
-  uint32_t gds_segment_byte_size;
-
-  /* The size in bytes of the kernarg segment that holds the values
-   * of the arguments to the kernel. This could be used by CP to
-   * prefetch the kernarg segment pointed to by the dispatch packet.
-   */
-  uint64_t kernarg_segment_byte_size;
-
-  /* Number of fbarrier's used in the kernel and all functions it
-   * calls. If the implementation uses group memory to allocate the
-   * fbarriers then that amount must already be included in the
-   * workgroup_group_segment_byte_size total.
-   */
-  uint32_t workgroup_fbarrier_count;
-
-  /* Number of scalar registers used by a wavefront. This includes
-   * the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size
-   * and XNACK (for GFX8 (VI)). It does not include the 16 SGPR added if a
-   * trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS.
-   */
-  uint16_t wavefront_sgpr_count;
-
-  /* Number of vector registers used by each work-item. Used to set
-   * COMPUTE_PGM_RSRC1.VGPRS.
-   */
-  uint16_t workitem_vgpr_count;
-
-  /* If reserved_vgpr_count is 0 then must be 0. Otherwise, this is the
-   * first fixed VGPR number reserved.
-   */
-  uint16_t reserved_vgpr_first;
-
-  /* The number of consecutive VGPRs reserved by the client. If
-   * is_debug_supported then this count includes VGPRs reserved
-   * for debugger use.
-   */
-  uint16_t reserved_vgpr_count;
-
-  /* If reserved_sgpr_count is 0 then must be 0. Otherwise, this is the
-   * first fixed SGPR number reserved.
-   */
-  uint16_t reserved_sgpr_first;
-
-  /* The number of consecutive SGPRs reserved by the client. If
-   * is_debug_supported then this count includes SGPRs reserved
-   * for debugger use.
-   */
-  uint16_t reserved_sgpr_count;
-
-  /* If is_debug_supported is 0 then must be 0. Otherwise, this is the
-   * fixed SGPR number used to hold the wave scratch offset for the
-   * entire kernel execution, or uint16_t(-1) if the register is not
-   * used or not known.
-   */
-  uint16_t debug_wavefront_private_segment_offset_sgpr;
-
-  /* If is_debug_supported is 0 then must be 0. Otherwise, this is the
-   * fixed SGPR number of the first of 4 SGPRs used to hold the
-   * scratch V# used for the entire kernel execution, or uint16_t(-1)
-   * if the registers are not used or not known.
-   */
-  uint16_t debug_private_segment_buffer_sgpr;
-
-  /* The maximum byte alignment of variables used by the kernel in
-   * the specified memory segment. Expressed as a power of two. Must
-   * be at least HSA_POWERTWO_16.
-   */
-  uint8_t kernarg_segment_alignment;
-  uint8_t group_segment_alignment;
-  uint8_t private_segment_alignment;
-
-  /* Wavefront size expressed as a power of two. Must be a power of 2
-   * in range 1..64 inclusive. Used to support runtime query that
-   * obtains wavefront size, which may be used by application to
-   * allocated dynamic group memory and set the dispatch work-group
-   * size.
-   */
-  uint8_t wavefront_size;
-
-  int32_t call_convention;
-  uint8_t reserved3[12];
-  uint64_t runtime_loader_kernel_symbol;
-  uint64_t control_directives[16];
+   uint32_t amd_kernel_code_version_major;
+   uint32_t amd_kernel_code_version_minor;
+   uint16_t amd_machine_kind;
+   uint16_t amd_machine_version_major;
+   uint16_t amd_machine_version_minor;
+   uint16_t amd_machine_version_stepping;
+
+   /* Byte offset (possibly negative) from start of amd_kernel_code_t
+    * object to kernel's entry point instruction. The actual code for
+    * the kernel is required to be 256 byte aligned to match hardware
+    * requirements (SQ cache line is 16). The code must be position
+    * independent code (PIC) for AMD devices to give runtime the
+    * option of copying code to discrete GPU memory or APU L2
+    * cache. The Finalizer should endeavour to allocate all kernel
+    * machine code in contiguous memory pages so that a device
+    * pre-fetcher will tend to only pre-fetch Kernel Code objects,
+    * improving cache performance.
+    */
+   int64_t kernel_code_entry_byte_offset;
+
+   /* Range of bytes to consider prefetching, expressed as an offset
+    * and size. The offset (possibly negative) is from the start of the
+    * amd_kernel_code_t object. Set both to 0 if no prefetch
+    * information is available.
+    */
+   int64_t kernel_code_prefetch_byte_offset;
+   uint64_t kernel_code_prefetch_byte_size;
+
+   /* Number of bytes of scratch backing memory required for full
+    * occupancy of target chip. This takes into account the number of
+    * bytes of scratch per work-item, the wavefront size, the maximum
+    * number of wavefronts per CU, and the number of CUs. This is an
+    * upper limit on scratch. If the grid being dispatched is small, it
+    * may need less than this. If the kernel uses no scratch, or
+    * the Finalizer has not computed this value, it must be 0.
+    */
+   uint64_t max_scratch_backing_memory_byte_size;
+
+   /* Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and
+    * COMPUTE_PGM_RSRC2 registers.
+    */
+   uint64_t compute_pgm_resource_registers;
+
+   /* Code properties. See amd_code_property_mask_t for a full list of
+    * properties.
+    */
+   uint32_t code_properties;
+
+   /* The amount of memory required for the combined private, spill
+    * and arg segments for a work-item in bytes. If
+    * is_dynamic_callstack is 1 then additional space must be added to
+    * this value for the call stack.
+    */
+   uint32_t workitem_private_segment_byte_size;
+
+   /* The amount of group segment memory required by a work-group in
+    * bytes. This does not include any dynamically allocated group
+    * segment memory that may be added when the kernel is
+    * dispatched.
+    */
+   uint32_t workgroup_group_segment_byte_size;
+
+   /* Number of bytes of GDS required by the kernel dispatch. Must be 0 if
+    * not using GDS.
+    */
+   uint32_t gds_segment_byte_size;
+
+   /* The size in bytes of the kernarg segment that holds the values
+    * of the arguments to the kernel. This could be used by CP to
+    * prefetch the kernarg segment pointed to by the dispatch packet.
+    */
+   uint64_t kernarg_segment_byte_size;
+
+   /* Number of fbarriers used in the kernel and all functions it
+    * calls. If the implementation uses group memory to allocate the
+    * fbarriers then that amount must already be included in the
+    * workgroup_group_segment_byte_size total.
+    */
+   uint32_t workgroup_fbarrier_count;
+
+   /* Number of scalar registers used by a wavefront. This includes
+    * the special SGPRs for VCC, Flat Scratch Base, Flat Scratch Size
+    * and XNACK (for GFX8 (VI)). It does not include the 16 SGPRs added if a
+    * trap handler is enabled. Used to set COMPUTE_PGM_RSRC1.SGPRS.
+    */
+   uint16_t wavefront_sgpr_count;
+
+   /* Number of vector registers used by each work-item. Used to set
+    * COMPUTE_PGM_RSRC1.VGPRS.
+    */
+   uint16_t workitem_vgpr_count;
+
+   /* If reserved_vgpr_count is 0 then this must be 0. Otherwise, this is the
+    * first fixed VGPR number reserved.
+    */
+   uint16_t reserved_vgpr_first;
+
+   /* The number of consecutive VGPRs reserved by the client. If
+    * is_debug_supported then this count includes VGPRs reserved
+    * for debugger use.
+    */
+   uint16_t reserved_vgpr_count;
+
+   /* If reserved_sgpr_count is 0 then this must be 0. Otherwise, this is the
+    * first fixed SGPR number reserved.
+    */
+   uint16_t reserved_sgpr_first;
+
+   /* The number of consecutive SGPRs reserved by the client. If
+    * is_debug_supported then this count includes SGPRs reserved
+    * for debugger use.
+    */
+   uint16_t reserved_sgpr_count;
+
+   /* If is_debug_supported is 0 then this must be 0. Otherwise, this is the
+    * fixed SGPR number used to hold the wave scratch offset for the
+    * entire kernel execution, or uint16_t(-1) if the register is not
+    * used or not known.
+    */
+   uint16_t debug_wavefront_private_segment_offset_sgpr;
+
+   /* If is_debug_supported is 0 then this must be 0. Otherwise, this is the
+    * fixed SGPR number of the first of 4 SGPRs used to hold the
+    * scratch V# used for the entire kernel execution, or uint16_t(-1)
+    * if the registers are not used or not known.
+    */
+   uint16_t debug_private_segment_buffer_sgpr;
+
+   /* The maximum byte alignment of variables used by the kernel in
+    * the specified memory segment. Expressed as a power of two. Must
+    * be at least HSA_POWERTWO_16.
+    */
+   uint8_t kernarg_segment_alignment;
+   uint8_t group_segment_alignment;
+   uint8_t private_segment_alignment;
+
+   /* Wavefront size expressed as a power of two. Must be a power of 2
+    * in the range 1..64 inclusive. Used to support a runtime query that
+    * obtains the wavefront size, which may be used by the application to
+    * allocate dynamic group memory and set the dispatch work-group
+    * size.
+    */
+   uint8_t wavefront_size;
+
+   int32_t call_convention;
+   uint8_t reserved3[12];
+   uint64_t runtime_loader_kernel_symbol;
+   uint64_t control_directives[16];
 } amd_kernel_code_t;
 
 #endif // AMDKERNELCODET_H
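
(Editor's note, not part of the patch: a minimal sketch of how a loader might consume two of the fields above. The helper names are invented for illustration, and the second helper assumes wavefront_size stores the power-of-two exponent, as the field comment suggests.)

#include <stdint.h>

/* Hypothetical helper: resolve the kernel entry point from the descriptor.
 * kernel_code_entry_byte_offset is relative to the start of the
 * amd_kernel_code_t object itself and may be negative. */
static const void *kernel_entry_address(const amd_kernel_code_t *code)
{
   return (const char *)code + code->kernel_code_entry_byte_offset;
}

/* Hypothetical helper, assuming the field holds the log2 of the wavefront
 * size, so a value of 6 means 64 lanes. */
static unsigned kernel_wavefront_lanes(const amd_kernel_code_t *code)
{
   return 1u << code->wavefront_size;
}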
index 2692c817f08f0c81bc213dc4615396c8ebf83868..9eff122119282f31b8bb05b06c78e619a9ba93cc 100644 (file)
 #ifndef GFX10_FORMAT_TABLE_H
 #define GFX10_FORMAT_TABLE_H
 
-#include <stdbool.h>
 #include "pipe/p_format.h"
 
+#include <stdbool.h>
+
 struct gfx10_format {
-    unsigned img_format:9;
+   unsigned img_format : 9;
 
-    /* Various formats are only supported with workarounds for vertex fetch,
-     * and some 32_32_32 formats are supported natively, but only for buffers
-     * (possibly with some image support, actually, but no filtering). */
-    bool buffers_only:1;
+   /* Various formats are only supported with workarounds for vertex fetch,
+    * and some 32_32_32 formats are supported natively, but only for buffers
+    * (possibly with some image support, actually, but no filtering). */
+   bool buffers_only : 1;
 };
 
 extern const struct gfx10_format gfx10_format_table[PIPE_FORMAT_COUNT];
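
(Editor's note, not part of the patch: a sketch of how a driver might consult this table. query_img_format() is a made-up helper; only gfx10_format_table and struct gfx10_format come from the header above.)

#include <stdbool.h>
#include "pipe/p_format.h"

/* Hypothetical helper: return the hardware image format for a Gallium
 * format, or 0 when the format is usable for buffer access only. */
static unsigned query_img_format(enum pipe_format pfmt, bool for_image)
{
   const struct gfx10_format *fmt = &gfx10_format_table[pfmt];

   if (for_image && fmt->buffers_only)
      return 0; /* caller must pick a fallback format */

   return fmt->img_format;
}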
index c4474383cbe786a50afda5297e2bca32266226ec..a55598ce9c9f392ecdb9918184ee322d51ea87bb 100644 (file)
 #include "amdgfxregs.h"
 
 /* si values */
-#define SI_CONFIG_REG_OFFSET                 0x00008000
-#define SI_CONFIG_REG_END                    0x0000B000
-#define SI_SH_REG_OFFSET                     0x0000B000
-#define SI_SH_REG_END                        0x0000C000
-#define SI_CONTEXT_REG_OFFSET                0x00028000
-#define SI_CONTEXT_REG_END                   0x00030000
-#define CIK_UCONFIG_REG_OFFSET               0x00030000
-#define CIK_UCONFIG_REG_END                  0x00040000
-#define SI_UCONFIG_PERF_REG_OFFSET           0x00034000
-#define SI_UCONFIG_PERF_REG_END              0x00038000
+#define SI_CONFIG_REG_OFFSET       0x00008000
+#define SI_CONFIG_REG_END          0x0000B000
+#define SI_SH_REG_OFFSET           0x0000B000
+#define SI_SH_REG_END              0x0000C000
+#define SI_CONTEXT_REG_OFFSET      0x00028000
+#define SI_CONTEXT_REG_END         0x00030000
+#define CIK_UCONFIG_REG_OFFSET     0x00030000
+#define CIK_UCONFIG_REG_END        0x00040000
+#define SI_UCONFIG_PERF_REG_OFFSET 0x00034000
+#define SI_UCONFIG_PERF_REG_END    0x00038000
 
 /* For register shadowing: */
-#define SI_SH_REG_SPACE_SIZE                   (SI_SH_REG_END - SI_SH_REG_OFFSET)
-#define SI_CONTEXT_REG_SPACE_SIZE              (SI_CONTEXT_REG_END - SI_CONTEXT_REG_OFFSET)
-#define SI_UCONFIG_REG_SPACE_SIZE              (CIK_UCONFIG_REG_END - CIK_UCONFIG_REG_OFFSET)
-#define SI_UCONFIG_PERF_REG_SPACE_SIZE          (SI_UCONFIG_PERF_REG_END - SI_UCONFIG_PERF_REG_OFFSET)
+#define SI_SH_REG_SPACE_SIZE           (SI_SH_REG_END - SI_SH_REG_OFFSET)
+#define SI_CONTEXT_REG_SPACE_SIZE      (SI_CONTEXT_REG_END - SI_CONTEXT_REG_OFFSET)
+#define SI_UCONFIG_REG_SPACE_SIZE      (CIK_UCONFIG_REG_END - CIK_UCONFIG_REG_OFFSET)
+#define SI_UCONFIG_PERF_REG_SPACE_SIZE (SI_UCONFIG_PERF_REG_END - SI_UCONFIG_PERF_REG_OFFSET)
 
-#define SI_SHADOWED_SH_REG_OFFSET              0
-#define SI_SHADOWED_CONTEXT_REG_OFFSET         SI_SH_REG_SPACE_SIZE
-#define SI_SHADOWED_UCONFIG_REG_OFFSET         (SI_SH_REG_SPACE_SIZE + SI_CONTEXT_REG_SPACE_SIZE)
-#define SI_SHADOWED_REG_BUFFER_SIZE            (SI_SH_REG_SPACE_SIZE + SI_CONTEXT_REG_SPACE_SIZE + \
-                                                SI_UCONFIG_REG_SPACE_SIZE)
+#define SI_SHADOWED_SH_REG_OFFSET      0
+#define SI_SHADOWED_CONTEXT_REG_OFFSET SI_SH_REG_SPACE_SIZE
+#define SI_SHADOWED_UCONFIG_REG_OFFSET (SI_SH_REG_SPACE_SIZE + SI_CONTEXT_REG_SPACE_SIZE)
+#define SI_SHADOWED_REG_BUFFER_SIZE                                                                \
+   (SI_SH_REG_SPACE_SIZE + SI_CONTEXT_REG_SPACE_SIZE + SI_UCONFIG_REG_SPACE_SIZE)
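
(Editor's note, for orientation only: a sketch of how the shadowing offsets above fit together. sh_reg_shadow_offset() is an invented helper, not driver code.)

#include <stdbool.h>

/* Hypothetical helper: map an SH register offset to its byte offset in
 * the shadow buffer laid out by the macros above. */
static bool sh_reg_shadow_offset(unsigned reg, unsigned *out_offset)
{
   if (reg < SI_SH_REG_OFFSET || reg >= SI_SH_REG_END)
      return false;

   *out_offset = SI_SHADOWED_SH_REG_OFFSET + (reg - SI_SH_REG_OFFSET);
   return true;
}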
 
 #define EVENT_TYPE_CACHE_FLUSH                  0x6
-#define EVENT_TYPE_PS_PARTIAL_FLUSH            0x10
+#define EVENT_TYPE_PS_PARTIAL_FLUSH             0x10
 #define EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT 0x14
-#define EVENT_TYPE_ZPASS_DONE                  0x15
-#define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT   0x16
-#define EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH       0x1f
-#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS       0x20
-#define                EVENT_TYPE(x)                           ((x) << 0)
-#define                EVENT_INDEX(x)                          ((x) << 8)
-                /* 0 - any non-TS event
-               * 1 - ZPASS_DONE
-               * 2 - SAMPLE_PIPELINESTAT
-               * 3 - SAMPLE_STREAMOUTSTAT*
-               * 4 - *S_PARTIAL_FLUSH
-               * 5 - TS events
-               */
+#define EVENT_TYPE_ZPASS_DONE                   0x15
+#define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT    0x16
+#define EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH        0x1f
+#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS        0x20
+#define EVENT_TYPE(x)                           ((x) << 0)
+#define EVENT_INDEX(x)                          ((x) << 8)
+/* 0 - any non-TS event
+ * 1 - ZPASS_DONE
+ * 2 - SAMPLE_PIPELINESTAT
+ * 3 - SAMPLE_STREAMOUTSTAT*
+ * 4 - *S_PARTIAL_FLUSH
+ * 5 - TS events
+ */
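
(Editor's note, illustration only: the two macros above are OR'ed into one event dword. PS_PARTIAL_FLUSH falls in the *S_PARTIAL_FLUSH group, so it pairs with event index 4 per the list above; the helper is invented.)

/* Sketch: compose the event dword for a pixel shader partial flush. */
static unsigned ps_partial_flush_event_dw(void)
{
   return EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
}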
 
 /* EVENT_WRITE_EOP (SI-VI) & RELEASE_MEM (GFX9) */
-#define EVENT_TCL1_VOL_ACTION_ENA              (1 << 12)
-#define EVENT_TC_VOL_ACTION_ENA                        (1 << 13)
-#define EVENT_TC_WB_ACTION_ENA                 (1 << 15)
-#define EVENT_TCL1_ACTION_ENA                  (1 << 16)
-#define EVENT_TC_ACTION_ENA                    (1 << 17)
-#define EVENT_TC_NC_ACTION_ENA                 (1 << 19) /* GFX9+ */
-#define EVENT_TC_WC_ACTION_ENA                 (1 << 20) /* GFX9+ */
-#define EVENT_TC_MD_ACTION_ENA                 (1 << 21) /* GFX9+ */
-
+#define EVENT_TCL1_VOL_ACTION_ENA (1 << 12)
+#define EVENT_TC_VOL_ACTION_ENA   (1 << 13)
+#define EVENT_TC_WB_ACTION_ENA    (1 << 15)
+#define EVENT_TCL1_ACTION_ENA     (1 << 16)
+#define EVENT_TC_ACTION_ENA       (1 << 17)
+#define EVENT_TC_NC_ACTION_ENA    (1 << 19) /* GFX9+ */
+#define EVENT_TC_WC_ACTION_ENA    (1 << 20) /* GFX9+ */
+#define EVENT_TC_MD_ACTION_ENA    (1 << 21) /* GFX9+ */
 
-#define PREDICATION_OP_CLEAR 0x0
-#define PREDICATION_OP_ZPASS 0x1
+#define PREDICATION_OP_CLEAR     0x0
+#define PREDICATION_OP_ZPASS     0x1
 #define PREDICATION_OP_PRIMCOUNT 0x2
-#define PREDICATION_OP_BOOL64 0x3
+#define PREDICATION_OP_BOOL64    0x3
 
 #define PRED_OP(x) ((x) << 16)
 
 #define PREDICATION_CONTINUE (1 << 31)
 
-#define PREDICATION_HINT_WAIT (0 << 12)
+#define PREDICATION_HINT_WAIT        (0 << 12)
 #define PREDICATION_HINT_NOWAIT_DRAW (1 << 12)
 
 #define PREDICATION_DRAW_NOT_VISIBLE (0 << 8)
-#define PREDICATION_DRAW_VISIBLE (1 << 8)
+#define PREDICATION_DRAW_VISIBLE     (1 << 8)
 
-#define R600_TEXEL_PITCH_ALIGNMENT_MASK        0x7
+#define R600_TEXEL_PITCH_ALIGNMENT_MASK 0x7
 
 /* All registers defined in this packet section don't exist and the only
  * purpose of these definitions is to define packet encoding that
  * the IB parser understands, and also to have an accurate documentation.
  */
-#define PKT3_NOP                               0x10
-#define PKT3_SET_BASE                          0x11
-#define PKT3_CLEAR_STATE                       0x12
-#define PKT3_INDEX_BUFFER_SIZE                 0x13
-#define PKT3_DISPATCH_DIRECT                   0x15
-#define PKT3_DISPATCH_INDIRECT                 0x16
-#define PKT3_OCCLUSION_QUERY                   0x1F /* new for CIK */
-#define PKT3_SET_PREDICATION                   0x20
-#define PKT3_COND_EXEC                         0x22
-#define PKT3_PRED_EXEC                         0x23
-#define PKT3_DRAW_INDIRECT                     0x24
-#define PKT3_DRAW_INDEX_INDIRECT               0x25
-#define PKT3_INDEX_BASE                        0x26
-#define PKT3_DRAW_INDEX_2                      0x27
-#define PKT3_CONTEXT_CONTROL                   0x28
-#define     CC0_LOAD_GLOBAL_CONFIG(x)          (((unsigned)(x) & 0x1) << 0)
-#define     CC0_LOAD_PER_CONTEXT_STATE(x)      (((unsigned)(x) & 0x1) << 1)
-#define     CC0_LOAD_GLOBAL_UCONFIG(x)         (((unsigned)(x) & 0x1) << 15)
-#define     CC0_LOAD_GFX_SH_REGS(x)            (((unsigned)(x) & 0x1) << 16)
-#define     CC0_LOAD_CS_SH_REGS(x)             (((unsigned)(x) & 0x1) << 24)
-#define     CC0_LOAD_CE_RAM(x)                 (((unsigned)(x) & 0x1) << 28)
-#define     CC0_UPDATE_LOAD_ENABLES(x)         (((unsigned)(x) & 0x1) << 31)
-#define     CC1_SHADOW_GLOBAL_CONFIG(x)        (((unsigned)(x) & 0x1) << 0)
-#define     CC1_SHADOW_PER_CONTEXT_STATE(x)    (((unsigned)(x) & 0x1) << 1)
-#define     CC1_SHADOW_GLOBAL_UCONFIG(x)       (((unsigned)(x) & 0x1) << 15)
-#define     CC1_SHADOW_GFX_SH_REGS(x)          (((unsigned)(x) & 0x1) << 16)
-#define     CC1_SHADOW_CS_SH_REGS(x)           (((unsigned)(x) & 0x1) << 24)
-#define     CC1_UPDATE_SHADOW_ENABLES(x)       (((unsigned)(x) & 0x1) << 31)
-#define PKT3_INDEX_TYPE                        0x2A /* not on GFX9 */
-#define PKT3_DRAW_INDIRECT_MULTI               0x2C
-#define   R_2C3_DRAW_INDEX_LOC                  0x2C3
-#define     S_2C3_COUNT_INDIRECT_ENABLE(x)      (((unsigned)(x) & 0x1) << 30)
-#define     S_2C3_DRAW_INDEX_ENABLE(x)          (((unsigned)(x) & 0x1) << 31)
-#define PKT3_DRAW_INDEX_AUTO                   0x2D
-#define PKT3_DRAW_INDEX_IMMD                   0x2E /* not on CIK */
-#define PKT3_NUM_INSTANCES                     0x2F
-#define PKT3_DRAW_INDEX_MULTI_AUTO             0x30
-#define PKT3_INDIRECT_BUFFER_SI                0x32 /* not on CIK */
-#define PKT3_INDIRECT_BUFFER_CONST             0x33
-#define PKT3_STRMOUT_BUFFER_UPDATE             0x34
-#define                STRMOUT_STORE_BUFFER_FILLED_SIZE        1
-#define                STRMOUT_OFFSET_SOURCE(x)        (((unsigned)(x) & 0x3) << 1)
-#define                        STRMOUT_OFFSET_FROM_PACKET              0
-#define                        STRMOUT_OFFSET_FROM_VGT_FILLED_SIZE     1
-#define                        STRMOUT_OFFSET_FROM_MEM                 2
-#define                        STRMOUT_OFFSET_NONE                     3
-#define                STRMOUT_DATA_TYPE(x)            (((unsigned)(x) & 0x1) << 7)
-#define                STRMOUT_SELECT_BUFFER(x)        (((unsigned)(x) & 0x3) << 8)
-#define PKT3_DRAW_INDEX_OFFSET_2               0x35
-#define PKT3_WRITE_DATA                        0x37
-#define PKT3_DRAW_INDEX_INDIRECT_MULTI         0x38
-#define PKT3_MEM_SEMAPHORE                     0x39
-#define PKT3_MPEG_INDEX                        0x3A /* not on CIK */
-#define PKT3_WAIT_REG_MEM                      0x3C
-#define                WAIT_REG_MEM_EQUAL              3
-#define                WAIT_REG_MEM_NOT_EQUAL          4
-#define                WAIT_REG_MEM_GREATER_OR_EQUAL   5
-#define         WAIT_REG_MEM_MEM_SPACE(x)       (((unsigned)(x) & 0x3) << 4)
-#define         WAIT_REG_MEM_PFP               (1 << 8)
-#define PKT3_MEM_WRITE                         0x3D /* not on CIK */
-#define PKT3_INDIRECT_BUFFER_CIK               0x3F /* new on CIK */
+#define PKT3_NOP                            0x10
+#define PKT3_SET_BASE                       0x11
+#define PKT3_CLEAR_STATE                    0x12
+#define PKT3_INDEX_BUFFER_SIZE              0x13
+#define PKT3_DISPATCH_DIRECT                0x15
+#define PKT3_DISPATCH_INDIRECT              0x16
+#define PKT3_OCCLUSION_QUERY                0x1F /* new for CIK */
+#define PKT3_SET_PREDICATION                0x20
+#define PKT3_COND_EXEC                      0x22
+#define PKT3_PRED_EXEC                      0x23
+#define PKT3_DRAW_INDIRECT                  0x24
+#define PKT3_DRAW_INDEX_INDIRECT            0x25
+#define PKT3_INDEX_BASE                     0x26
+#define PKT3_DRAW_INDEX_2                   0x27
+#define PKT3_CONTEXT_CONTROL                0x28
+#define CC0_LOAD_GLOBAL_CONFIG(x)           (((unsigned)(x)&0x1) << 0)
+#define CC0_LOAD_PER_CONTEXT_STATE(x)       (((unsigned)(x)&0x1) << 1)
+#define CC0_LOAD_GLOBAL_UCONFIG(x)          (((unsigned)(x)&0x1) << 15)
+#define CC0_LOAD_GFX_SH_REGS(x)             (((unsigned)(x)&0x1) << 16)
+#define CC0_LOAD_CS_SH_REGS(x)              (((unsigned)(x)&0x1) << 24)
+#define CC0_LOAD_CE_RAM(x)                  (((unsigned)(x)&0x1) << 28)
+#define CC0_UPDATE_LOAD_ENABLES(x)          (((unsigned)(x)&0x1) << 31)
+#define CC1_SHADOW_GLOBAL_CONFIG(x)         (((unsigned)(x)&0x1) << 0)
+#define CC1_SHADOW_PER_CONTEXT_STATE(x)     (((unsigned)(x)&0x1) << 1)
+#define CC1_SHADOW_GLOBAL_UCONFIG(x)        (((unsigned)(x)&0x1) << 15)
+#define CC1_SHADOW_GFX_SH_REGS(x)           (((unsigned)(x)&0x1) << 16)
+#define CC1_SHADOW_CS_SH_REGS(x)            (((unsigned)(x)&0x1) << 24)
+#define CC1_UPDATE_SHADOW_ENABLES(x)        (((unsigned)(x)&0x1) << 31)
+#define PKT3_INDEX_TYPE                     0x2A /* not on GFX9 */
+#define PKT3_DRAW_INDIRECT_MULTI            0x2C
+#define R_2C3_DRAW_INDEX_LOC                0x2C3
+#define S_2C3_COUNT_INDIRECT_ENABLE(x)      (((unsigned)(x)&0x1) << 30)
+#define S_2C3_DRAW_INDEX_ENABLE(x)          (((unsigned)(x)&0x1) << 31)
+#define PKT3_DRAW_INDEX_AUTO                0x2D
+#define PKT3_DRAW_INDEX_IMMD                0x2E /* not on CIK */
+#define PKT3_NUM_INSTANCES                  0x2F
+#define PKT3_DRAW_INDEX_MULTI_AUTO          0x30
+#define PKT3_INDIRECT_BUFFER_SI             0x32 /* not on CIK */
+#define PKT3_INDIRECT_BUFFER_CONST          0x33
+#define PKT3_STRMOUT_BUFFER_UPDATE          0x34
+#define STRMOUT_STORE_BUFFER_FILLED_SIZE    1
+#define STRMOUT_OFFSET_SOURCE(x)            (((unsigned)(x)&0x3) << 1)
+#define STRMOUT_OFFSET_FROM_PACKET          0
+#define STRMOUT_OFFSET_FROM_VGT_FILLED_SIZE 1
+#define STRMOUT_OFFSET_FROM_MEM             2
+#define STRMOUT_OFFSET_NONE                 3
+#define STRMOUT_DATA_TYPE(x)                (((unsigned)(x)&0x1) << 7)
+#define STRMOUT_SELECT_BUFFER(x)            (((unsigned)(x)&0x3) << 8)
+#define PKT3_DRAW_INDEX_OFFSET_2            0x35
+#define PKT3_WRITE_DATA                     0x37
+#define PKT3_DRAW_INDEX_INDIRECT_MULTI      0x38
+#define PKT3_MEM_SEMAPHORE                  0x39
+#define PKT3_MPEG_INDEX                     0x3A /* not on CIK */
+#define PKT3_WAIT_REG_MEM                   0x3C
+#define WAIT_REG_MEM_EQUAL                  3
+#define WAIT_REG_MEM_NOT_EQUAL              4
+#define WAIT_REG_MEM_GREATER_OR_EQUAL       5
+#define WAIT_REG_MEM_MEM_SPACE(x)           (((unsigned)(x)&0x3) << 4)
+#define WAIT_REG_MEM_PFP                    (1 << 8)
+#define PKT3_MEM_WRITE                      0x3D /* not on CIK */
+#define PKT3_INDIRECT_BUFFER_CIK            0x3F /* new on CIK */
 
-#define PKT3_COPY_DATA                        0x40
-#define                COPY_DATA_SRC_SEL(x)            ((x) & 0xf)
-#define                        COPY_DATA_REG           0
-#define                        COPY_DATA_SRC_MEM       1 /* only valid as source */
-#define                 COPY_DATA_TC_L2         2
-#define                 COPY_DATA_GDS           3
-#define                 COPY_DATA_PERF          4
-#define                 COPY_DATA_IMM           5
-#define                 COPY_DATA_TIMESTAMP     9
-#define                COPY_DATA_DST_SEL(x)            (((unsigned)(x) & 0xf) << 8)
-#define                 COPY_DATA_DST_MEM_GRBM 1 /* sync across GRBM, deprecated */
-#define                 COPY_DATA_TC_L2         2
-#define                 COPY_DATA_GDS           3
-#define                 COPY_DATA_PERF          4
-#define                 COPY_DATA_DST_MEM       5
-#define                COPY_DATA_COUNT_SEL             (1 << 16)
-#define                COPY_DATA_WR_CONFIRM            (1 << 20)
-#define                COPY_DATA_ENGINE_PFP            (1 << 30)
-#define PKT3_PFP_SYNC_ME                      0x42
+#define PKT3_COPY_DATA                         0x40
+#define COPY_DATA_SRC_SEL(x)                   ((x)&0xf)
+#define COPY_DATA_REG                          0
+#define COPY_DATA_SRC_MEM                      1 /* only valid as source */
+#define COPY_DATA_TC_L2                        2
+#define COPY_DATA_GDS                          3
+#define COPY_DATA_PERF                         4
+#define COPY_DATA_IMM                          5
+#define COPY_DATA_TIMESTAMP                    9
+#define COPY_DATA_DST_SEL(x)                   (((unsigned)(x)&0xf) << 8)
+#define COPY_DATA_DST_MEM_GRBM                 1 /* sync across GRBM, deprecated */
+#define COPY_DATA_TC_L2                        2
+#define COPY_DATA_GDS                          3
+#define COPY_DATA_PERF                         4
+#define COPY_DATA_DST_MEM                      5
+#define COPY_DATA_COUNT_SEL                    (1 << 16)
+#define COPY_DATA_WR_CONFIRM                   (1 << 20)
+#define COPY_DATA_ENGINE_PFP                   (1 << 30)
+#define PKT3_PFP_SYNC_ME                       0x42
 #define PKT3_SURFACE_SYNC                      0x43 /* deprecated on CIK, use ACQUIRE_MEM */
 #define PKT3_ME_INITIALIZE                     0x44 /* not on CIK */
 #define PKT3_COND_WRITE                        0x45
 #define PKT3_EVENT_WRITE                       0x46
 #define PKT3_EVENT_WRITE_EOP                   0x47 /* not on GFX9 */
-#define         EOP_DST_SEL(x)                         ((x) << 16)
-#define                        EOP_DST_SEL_MEM                 0
-#define                        EOP_DST_SEL_TC_L2               1
-#define         EOP_INT_SEL(x)                          ((x) << 24)
-#define                        EOP_INT_SEL_NONE                        0
-#define                        EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM  3
-#define         EOP_DATA_SEL(x)                         ((x) << 29)
-#define                        EOP_DATA_SEL_DISCARD            0
-#define                        EOP_DATA_SEL_VALUE_32BIT        1
-#define                        EOP_DATA_SEL_VALUE_64BIT        2
-#define                        EOP_DATA_SEL_TIMESTAMP          3
-#define                        EOP_DATA_SEL_GDS                5
-#define                EOP_DATA_GDS(dw_offset, num_dwords)     ((dw_offset) | ((unsigned)(num_dwords) << 16))
+#define EOP_DST_SEL(x)                         ((x) << 16)
+#define EOP_DST_SEL_MEM                        0
+#define EOP_DST_SEL_TC_L2                      1
+#define EOP_INT_SEL(x)                         ((x) << 24)
+#define EOP_INT_SEL_NONE                       0
+#define EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM 3
+#define EOP_DATA_SEL(x)                        ((x) << 29)
+#define EOP_DATA_SEL_DISCARD                   0
+#define EOP_DATA_SEL_VALUE_32BIT               1
+#define EOP_DATA_SEL_VALUE_64BIT               2
+#define EOP_DATA_SEL_TIMESTAMP                 3
+#define EOP_DATA_SEL_GDS                       5
+#define EOP_DATA_GDS(dw_offset, num_dwords)    ((dw_offset) | ((unsigned)(num_dwords) << 16))
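
(Editor's note: one plausible combination of the selectors above, shown only to illustrate how they compose; the patch does not imply any driver uses exactly this mix.)

/* Sketch: control bits for an end-of-pipe event that writes a 64-bit
 * timestamp to memory and raises an interrupt after the write lands. */
static unsigned eop_timestamp_control(void)
{
   return EOP_DST_SEL(EOP_DST_SEL_MEM) |
          EOP_INT_SEL(EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM) |
          EOP_DATA_SEL(EOP_DATA_SEL_TIMESTAMP);
}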
 /* CP DMA bug: Any use of CP_DMA.DST_SEL=TC must be avoided when EOS packets
  * are used. Use DST_SEL=MC instead. For prefetch, use SRC_SEL=TC and
  * DST_SEL=MC. Only CIK chips are affected.
  */
 /* fix CP DMA before uncommenting: */
 /*#define PKT3_EVENT_WRITE_EOS                   0x48*/ /* not on GFX9 */
-#define PKT3_RELEASE_MEM                       0x49 /* GFX9+ [any ring] or GFX8 [compute ring only] */
-#define PKT3_CONTEXT_REG_RMW                   0x51 /* older firmware versions on older chips don't have this */
-#define PKT3_ONE_REG_WRITE                     0x57 /* not on CIK */
-#define PKT3_ACQUIRE_MEM                       0x58 /* new for CIK */
-#define PKT3_REWIND                            0x59 /* VI+ [any ring] or CIK [compute ring only] */
-#define PKT3_LOAD_UCONFIG_REG                  0x5E /* GFX7+ */
-#define PKT3_LOAD_SH_REG                       0x5F
-#define PKT3_LOAD_CONTEXT_REG                  0x61
-#define PKT3_SET_CONFIG_REG                    0x68
-#define PKT3_SET_CONTEXT_REG                   0x69
-#define PKT3_SET_SH_REG                        0x76
-#define PKT3_SET_SH_REG_OFFSET                 0x77
-#define PKT3_SET_UCONFIG_REG                   0x79 /* new for CIK */
-#define PKT3_SET_UCONFIG_REG_INDEX             0x7A /* new for GFX9, CP ucode version >= 26 */
-#define PKT3_LOAD_CONST_RAM                    0x80
-#define PKT3_WRITE_CONST_RAM                   0x81
-#define PKT3_DUMP_CONST_RAM                    0x83
-#define PKT3_INCREMENT_CE_COUNTER              0x84
-#define PKT3_INCREMENT_DE_COUNTER              0x85
-#define PKT3_WAIT_ON_CE_COUNTER                0x86
-#define PKT3_SET_SH_REG_INDEX                  0x9B
-#define PKT3_LOAD_CONTEXT_REG_INDEX            0x9F /* new for VI */
+#define PKT3_RELEASE_MEM            0x49 /* GFX9+ [any ring] or GFX8 [compute ring only] */
+#define PKT3_CONTEXT_REG_RMW        0x51 /* older firmware versions on older chips don't have this */
+#define PKT3_ONE_REG_WRITE          0x57 /* not on CIK */
+#define PKT3_ACQUIRE_MEM            0x58 /* new for CIK */
+#define PKT3_REWIND                 0x59 /* VI+ [any ring] or CIK [compute ring only] */
+#define PKT3_LOAD_UCONFIG_REG       0x5E /* GFX7+ */
+#define PKT3_LOAD_SH_REG            0x5F
+#define PKT3_LOAD_CONTEXT_REG       0x61
+#define PKT3_SET_CONFIG_REG         0x68
+#define PKT3_SET_CONTEXT_REG        0x69
+#define PKT3_SET_SH_REG             0x76
+#define PKT3_SET_SH_REG_OFFSET      0x77
+#define PKT3_SET_UCONFIG_REG        0x79 /* new for CIK */
+#define PKT3_SET_UCONFIG_REG_INDEX  0x7A /* new for GFX9, CP ucode version >= 26 */
+#define PKT3_LOAD_CONST_RAM         0x80
+#define PKT3_WRITE_CONST_RAM        0x81
+#define PKT3_DUMP_CONST_RAM         0x83
+#define PKT3_INCREMENT_CE_COUNTER   0x84
+#define PKT3_INCREMENT_DE_COUNTER   0x85
+#define PKT3_WAIT_ON_CE_COUNTER     0x86
+#define PKT3_SET_SH_REG_INDEX       0x9B
+#define PKT3_LOAD_CONTEXT_REG_INDEX 0x9F /* new for VI */
 
-#define PKT_TYPE_S(x)                   (((unsigned)(x) & 0x3) << 30)
-#define PKT_TYPE_G(x)                   (((x) >> 30) & 0x3)
-#define PKT_TYPE_C                      0x3FFFFFFF
-#define PKT_COUNT_S(x)                  (((unsigned)(x) & 0x3FFF) << 16)
-#define PKT_COUNT_G(x)                  (((x) >> 16) & 0x3FFF)
-#define PKT_COUNT_C                     0xC000FFFF
-#define PKT0_BASE_INDEX_S(x)            (((unsigned)(x) & 0xFFFF) << 0)
-#define PKT0_BASE_INDEX_G(x)            (((x) >> 0) & 0xFFFF)
-#define PKT0_BASE_INDEX_C               0xFFFF0000
-#define PKT3_IT_OPCODE_S(x)             (((unsigned)(x) & 0xFF) << 8)
-#define PKT3_IT_OPCODE_G(x)             (((x) >> 8) & 0xFF)
-#define PKT3_IT_OPCODE_C                0xFFFF00FF
-#define PKT3_PREDICATE(x)               (((x) >> 0) & 0x1)
-#define PKT3_SHADER_TYPE_S(x)           (((unsigned)(x) & 0x1) << 1)
-#define PKT0(index, count) (PKT_TYPE_S(0) | PKT0_BASE_INDEX_S(index) | PKT_COUNT_S(count))
-#define PKT3(op, count, predicate) (PKT_TYPE_S(3) | PKT_COUNT_S(count) | PKT3_IT_OPCODE_S(op) | PKT3_PREDICATE(predicate))
+#define PKT_TYPE_S(x)         (((unsigned)(x)&0x3) << 30)
+#define PKT_TYPE_G(x)         (((x) >> 30) & 0x3)
+#define PKT_TYPE_C            0x3FFFFFFF
+#define PKT_COUNT_S(x)        (((unsigned)(x)&0x3FFF) << 16)
+#define PKT_COUNT_G(x)        (((x) >> 16) & 0x3FFF)
+#define PKT_COUNT_C           0xC000FFFF
+#define PKT0_BASE_INDEX_S(x)  (((unsigned)(x)&0xFFFF) << 0)
+#define PKT0_BASE_INDEX_G(x)  (((x) >> 0) & 0xFFFF)
+#define PKT0_BASE_INDEX_C     0xFFFF0000
+#define PKT3_IT_OPCODE_S(x)   (((unsigned)(x)&0xFF) << 8)
+#define PKT3_IT_OPCODE_G(x)   (((x) >> 8) & 0xFF)
+#define PKT3_IT_OPCODE_C      0xFFFF00FF
+#define PKT3_PREDICATE(x)     (((x) >> 0) & 0x1)
+#define PKT3_SHADER_TYPE_S(x) (((unsigned)(x)&0x1) << 1)
+#define PKT0(index, count)    (PKT_TYPE_S(0) | PKT0_BASE_INDEX_S(index) | PKT_COUNT_S(count))
+#define PKT3(op, count, predicate)                                                                 \
+   (PKT_TYPE_S(3) | PKT_COUNT_S(count) | PKT3_IT_OPCODE_S(op) | PKT3_PREDICATE(predicate))
 
-#define PKT2_NOP_PAD                    PKT_TYPE_S(2)
-#define PKT3_NOP_PAD                    PKT3(PKT3_NOP, 0x3fff, 0) /* header-only version */
+#define PKT2_NOP_PAD PKT_TYPE_S(2)
+#define PKT3_NOP_PAD PKT3(PKT3_NOP, 0x3fff, 0) /* header-only version */
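
(Editor's note, hedged example of the PKT3() helper: PM4 type-3 headers carry a payload count of N-1, so a packet with a single payload dword uses a count of 0. The emit function is invented for illustration.)

#include <stdint.h>

/* Hypothetical helper: append a NUM_INSTANCES packet to a command
 * buffer; cs points to the buffer and *ndw tracks its size in dwords. */
static void emit_num_instances(uint32_t *cs, unsigned *ndw, unsigned instances)
{
   cs[(*ndw)++] = PKT3(PKT3_NUM_INSTANCES, 0, 0); /* one payload dword */
   cs[(*ndw)++] = instances;
}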
 
-#define PKT3_CP_DMA                                    0x41
+#define PKT3_CP_DMA 0x41
 /* 1. header
  * 2. SRC_ADDR_LO [31:0] or DATA [31:0]
  * 3. CP_SYNC [31] | SRC_SEL [30:29] | ENGINE [27] | DST_SEL [21:20] | SRC_ADDR_HI [15:0]
  * 6. COMMAND [29:22] | BYTE_COUNT [20:0]
  */
 
-#define PKT3_DMA_DATA                                  0x50 /* new for CIK */
+#define PKT3_DMA_DATA 0x50 /* new for CIK */
 /* 1. header
  * 2. CP_SYNC [31] | SRC_SEL [30:29] | DST_SEL [21:20] | ENGINE [0]
  * 2. SRC_ADDR_LO [31:0] or DATA [31:0]
  */
 
 /* SI async DMA packets */
-#define SI_DMA_PACKET(cmd, sub_cmd, n) ((((unsigned)(cmd) & 0xF) << 28) |    \
-                                       (((unsigned)(sub_cmd) & 0xFF) << 20) |\
-                                       (((unsigned)(n) & 0xFFFFF) << 0))
+#define SI_DMA_PACKET(cmd, sub_cmd, n)                                                             \
+   ((((unsigned)(cmd)&0xF) << 28) | (((unsigned)(sub_cmd)&0xFF) << 20) |                           \
+    (((unsigned)(n)&0xFFFFF) << 0))
 /* SI async DMA Packet types */
-#define    SI_DMA_PACKET_WRITE                     0x2
-#define    SI_DMA_PACKET_COPY                      0x3
-#define    SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE       0xfffe0
+#define SI_DMA_PACKET_WRITE               0x2
+#define SI_DMA_PACKET_COPY                0x3
+#define SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE 0xfffe0
 /* The documentation says 0xffff8 is the maximum size in dwords, which is
  * 0x3fffe0 in bytes. */
-#define    SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE      0x3fffe0
-#define    SI_DMA_COPY_DWORD_ALIGNED               0x00
-#define    SI_DMA_COPY_BYTE_ALIGNED                0x40
-#define    SI_DMA_COPY_TILED                       0x8
-#define    SI_DMA_PACKET_INDIRECT_BUFFER           0x4
-#define    SI_DMA_PACKET_SEMAPHORE                 0x5
-#define    SI_DMA_PACKET_FENCE                     0x6
-#define    SI_DMA_PACKET_TRAP                      0x7
-#define    SI_DMA_PACKET_SRBM_WRITE                0x9
-#define    SI_DMA_PACKET_CONSTANT_FILL             0xd
-#define    SI_DMA_PACKET_NOP                       0xf
+#define SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE 0x3fffe0
+#define SI_DMA_COPY_DWORD_ALIGNED          0x00
+#define SI_DMA_COPY_BYTE_ALIGNED           0x40
+#define SI_DMA_COPY_TILED                  0x8
+#define SI_DMA_PACKET_INDIRECT_BUFFER      0x4
+#define SI_DMA_PACKET_SEMAPHORE            0x5
+#define SI_DMA_PACKET_FENCE                0x6
+#define SI_DMA_PACKET_TRAP                 0x7
+#define SI_DMA_PACKET_SRBM_WRITE           0x9
+#define SI_DMA_PACKET_CONSTANT_FILL        0xd
+#define SI_DMA_PACKET_NOP                  0xf
 
 /* CIK async DMA packets */
-#define CIK_SDMA_PACKET(op, sub_op, n)   ((((unsigned)(n) & 0xFFFF) << 16) |   \
-                                        (((unsigned)(sub_op) & 0xFF) << 8) |   \
-                                        (((unsigned)(op) & 0xFF) << 0))
+#define CIK_SDMA_PACKET(op, sub_op, n)                                                             \
+   ((((unsigned)(n)&0xFFFF) << 16) | (((unsigned)(sub_op)&0xFF) << 8) |                            \
+    (((unsigned)(op)&0xFF) << 0))
 /* CIK async DMA packet types */
-#define    CIK_SDMA_OPCODE_NOP                     0x0
-#define    CIK_SDMA_OPCODE_COPY                    0x1
-#define        CIK_SDMA_COPY_SUB_OPCODE_LINEAR            0x0
-#define        CIK_SDMA_COPY_SUB_OPCODE_TILED             0x1
-#define        CIK_SDMA_COPY_SUB_OPCODE_SOA               0x3
-#define        CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW 0x4
-#define        CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW  0x5
-#define        CIK_SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW    0x6
-#define    CIK_SDMA_OPCODE_WRITE                   0x2
-#define        SDMA_WRITE_SUB_OPCODE_LINEAR               0x0
-#define        SDMA_WRTIE_SUB_OPCODE_TILED                0x1
-#define    CIK_SDMA_OPCODE_INDIRECT_BUFFER         0x4
-#define    CIK_SDMA_PACKET_FENCE                   0x5
-#define    CIK_SDMA_PACKET_TRAP                    0x6
-#define    CIK_SDMA_PACKET_SEMAPHORE               0x7
-#define    CIK_SDMA_PACKET_CONSTANT_FILL           0xb
-#define    CIK_SDMA_OPCODE_TIMESTAMP               0xd
-#define        SDMA_TS_SUB_OPCODE_SET_LOCAL_TIMESTAMP     0x0
-#define        SDMA_TS_SUB_OPCODE_GET_LOCAL_TIMESTAMP     0x1
-#define        SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP    0x2
-#define    CIK_SDMA_PACKET_SRBM_WRITE              0xe
+#define CIK_SDMA_OPCODE_NOP                        0x0
+#define CIK_SDMA_OPCODE_COPY                       0x1
+#define CIK_SDMA_COPY_SUB_OPCODE_LINEAR            0x0
+#define CIK_SDMA_COPY_SUB_OPCODE_TILED             0x1
+#define CIK_SDMA_COPY_SUB_OPCODE_SOA               0x3
+#define CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW 0x4
+#define CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW  0x5
+#define CIK_SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW    0x6
+#define CIK_SDMA_OPCODE_WRITE                      0x2
+#define SDMA_WRITE_SUB_OPCODE_LINEAR               0x0
+#define SDMA_WRTIE_SUB_OPCODE_TILED                0x1
+#define CIK_SDMA_OPCODE_INDIRECT_BUFFER            0x4
+#define CIK_SDMA_PACKET_FENCE                      0x5
+#define CIK_SDMA_PACKET_TRAP                       0x6
+#define CIK_SDMA_PACKET_SEMAPHORE                  0x7
+#define CIK_SDMA_PACKET_CONSTANT_FILL              0xb
+#define CIK_SDMA_OPCODE_TIMESTAMP                  0xd
+#define SDMA_TS_SUB_OPCODE_SET_LOCAL_TIMESTAMP     0x0
+#define SDMA_TS_SUB_OPCODE_GET_LOCAL_TIMESTAMP     0x1
+#define SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP    0x2
+#define CIK_SDMA_PACKET_SRBM_WRITE                 0xe
 /* There is apparently an undocumented HW limitation that
    prevents the HW from copying the last 255 bytes of (1 << 22) - 1 */
-#define    CIK_SDMA_COPY_MAX_SIZE                  0x3fff00  /* almost 4 MB*/
-#define    GFX103_SDMA_COPY_MAX_SIZE               0x3fffff00 /* almost 1 GB */
+#define CIK_SDMA_COPY_MAX_SIZE    0x3fff00   /* almost 4 MB*/
+#define GFX103_SDMA_COPY_MAX_SIZE 0x3fffff00 /* almost 1 GB */
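
(Editor's note, for illustration: how the SDMA header macro packs its fields. The linear-copy header below is plausible but not lifted from driver code; the third argument is left 0 on the assumption that a plain linear copy needs no packet-specific bits there.)

/* Sketch: header dword for a CIK+ SDMA linear copy. The macro places
 * the opcode in bits [7:0], the sub-opcode in [15:8] and a
 * packet-specific field in [31:16]. */
static unsigned sdma_linear_copy_header(void)
{
   return CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
                          CIK_SDMA_COPY_SUB_OPCODE_LINEAR, 0);
}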
 
-enum amd_cmp_class_flags {
-       S_NAN = 1 << 0,        // Signaling NaN
-       Q_NAN = 1 << 1,        // Quiet NaN
-       N_INFINITY = 1 << 2,   // Negative infinity
-       N_NORMAL = 1 << 3,     // Negative normal
-       N_SUBNORMAL = 1 << 4,  // Negative subnormal
-       N_ZERO = 1 << 5,       // Negative zero
-       P_ZERO = 1 << 6,       // Positive zero
-       P_SUBNORMAL = 1 << 7,  // Positive subnormal
-       P_NORMAL = 1 << 8,     // Positive normal
-       P_INFINITY = 1 << 9    // Positive infinity
+enum amd_cmp_class_flags
+{
+   S_NAN = 1 << 0,       // Signaling NaN
+   Q_NAN = 1 << 1,       // Quiet NaN
+   N_INFINITY = 1 << 2,  // Negative infinity
+   N_NORMAL = 1 << 3,    // Negative normal
+   N_SUBNORMAL = 1 << 4, // Negative subnormal
+   N_ZERO = 1 << 5,      // Negative zero
+   P_ZERO = 1 << 6,      // Positive zero
+   P_SUBNORMAL = 1 << 7, // Positive subnormal
+   P_NORMAL = 1 << 8,    // Positive normal
+   P_INFINITY = 1 << 9   // Positive infinity
 };
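
(Editor's note, usage sketch only: the flags are meant to be OR'ed into class masks, e.g. for a float-classify style check. The mask names are invented.)

/* Sketch: a few masks built from amd_cmp_class_flags. */
static const unsigned any_nan_mask    = S_NAN | Q_NAN;
static const unsigned any_inf_mask    = N_INFINITY | P_INFINITY;
static const unsigned any_denorm_mask = N_SUBNORMAL | P_SUBNORMAL;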
 
 #endif /* _SID_H */