/* Initialize x86 cache info.
   Copyright (C) 2020-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

static const struct intel_02_cache_info
{
  unsigned char idx;
  unsigned char assoc;
  unsigned char linesize;
  unsigned char rel_name;
  unsigned int size;
} intel_02_known [] =
  {
#define M(sc) ((sc) - _SC_LEVEL1_ICACHE_SIZE)
    { 0x06,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    8192 },
    { 0x08,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),   16384 },
    { 0x09,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),   32768 },
    { 0x0a,  2, 32, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
    { 0x0c,  4, 32, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x0d,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x0e,  6, 64, M(_SC_LEVEL1_DCACHE_SIZE),   24576 },
    { 0x21,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x22,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
    { 0x23,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0x25,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0x29,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0x2c,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),   32768 },
    { 0x30,  8, 64, M(_SC_LEVEL1_ICACHE_SIZE),   32768 },
    { 0x39,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x3a,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),   196608 },
    { 0x3b,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x3c,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x3d,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),   393216 },
    { 0x3e,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x3f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x41,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x42,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x43,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x44,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x45,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x46,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0x47,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0x48, 12, 64, M(_SC_LEVEL2_CACHE_SIZE),  3145728 },
    { 0x49, 16, 64, M(_SC_LEVEL2_CACHE_SIZE),  4194304 },
    { 0x4a, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  6291456 },
    { 0x4b, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0x4c, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
    { 0x4d, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 16777216 },
    { 0x4e, 24, 64, M(_SC_LEVEL2_CACHE_SIZE),  6291456 },
    { 0x60,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x66,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
    { 0x67,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x68,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   32768 },
    { 0x78,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x79,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x7a,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x7b,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x7c,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x7d,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x7f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x80,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x82,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x83,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x84,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x85,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x86,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x87,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0xd0,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
    { 0xd1,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0xd2,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xd6,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0xd7,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xd8,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xdc, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xdd, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xde, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0xe2, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
    { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 18874368 },
    { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 25165824 },
  };

#define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0]))

static int
intel_02_known_compare (const void *p1, const void *p2)
{
  const struct intel_02_cache_info *i1;
  const struct intel_02_cache_info *i2;

  i1 = (const struct intel_02_cache_info *) p1;
  i2 = (const struct intel_02_cache_info *) p2;

  if (i1->idx == i2->idx)
    return 0;

  return i1->idx < i2->idx ? -1 : 1;
}


static long int
__attribute__ ((noinline))
intel_check_word (int name, unsigned int value, bool *has_level_2,
                  bool *no_level_2_or_3,
                  const struct cpu_features *cpu_features)
{
  if ((value & 0x80000000) != 0)
    /* The register value is reserved.  */
    return 0;

  /* Fold the name.  The _SC_ constants are always in the order SIZE,
     ASSOC, LINESIZE.  */
  int folded_rel_name = (M(name) / 3) * 3;

  while (value != 0)
    {
      unsigned int byte = value & 0xff;

      if (byte == 0x40)
        {
          *no_level_2_or_3 = true;

          if (folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
            /* No need to look further.  */
            break;
        }
      else if (byte == 0xff)
        {
          /* CPUID leaf 0x4 contains all the information.  We need to
             iterate over it.  */
          unsigned int eax;
          unsigned int ebx;
          unsigned int ecx;
          unsigned int edx;

          unsigned int round = 0;
          while (1)
            {
              __cpuid_count (4, round, eax, ebx, ecx, edx);

              enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
              if (type == null)
                /* That was the end.  */
                break;

              unsigned int level = (eax >> 5) & 0x7;

              if ((level == 1 && type == data
                   && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
                  || (level == 1 && type == inst
                      && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
                  || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
                  || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
                  || (level == 4 && folded_rel_name == M(_SC_LEVEL4_CACHE_SIZE)))
                {
                  unsigned int offset = M(name) - folded_rel_name;

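                  /* Leaf-4 layout assumed by the decoding below (this is
                     my reading of the CPUID documentation, not stated in
                     the original code): EBX bits 31:22 hold ways - 1,
                     bits 21:12 partitions - 1, bits 11:0 line size - 1,
                     and ECX holds sets - 1.  The cache size is the
                     product of the four decoded fields.  */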
                  if (offset == 0)
                    /* Cache size.  */
                    return (((ebx >> 22) + 1)
                            * (((ebx >> 12) & 0x3ff) + 1)
                            * ((ebx & 0xfff) + 1)
                            * (ecx + 1));
                  if (offset == 1)
                    return (ebx >> 22) + 1;

                  assert (offset == 2);
                  return (ebx & 0xfff) + 1;
                }

              ++round;
            }
          /* There is no other cache information anywhere else.  */
          break;
        }
      else
        {
          if (byte == 0x49 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
            {
              /* Intel reused this value.  For family 15, model 6 it
                 specifies the 3rd level cache.  Otherwise the 2nd
                 level cache.  */
              unsigned int family = cpu_features->basic.family;
              unsigned int model = cpu_features->basic.model;

              if (family == 15 && model == 6)
                {
                  /* The level 3 cache is encoded for this model like
                     the level 2 cache is for other models.  Pretend
                     the caller asked for the level 2 cache.  */
                  name = (_SC_LEVEL2_CACHE_SIZE
                          + (name - _SC_LEVEL3_CACHE_SIZE));
                  folded_rel_name = M(_SC_LEVEL2_CACHE_SIZE);
                }
            }

          struct intel_02_cache_info *found;
          struct intel_02_cache_info search;

          search.idx = byte;
          found = bsearch (&search, intel_02_known, nintel_02_known,
                           sizeof (intel_02_known[0]), intel_02_known_compare);
          if (found != NULL)
            {
              if (found->rel_name == folded_rel_name)
                {
                  unsigned int offset = M(name) - folded_rel_name;

                  if (offset == 0)
                    /* Cache size.  */
                    return found->size;
                  if (offset == 1)
                    return found->assoc;

                  assert (offset == 2);
                  return found->linesize;
                }

              if (found->rel_name == M(_SC_LEVEL2_CACHE_SIZE))
                *has_level_2 = true;
            }
        }

      /* Next byte for the next round.  */
      value >>= 8;
    }

  /* Nothing found.  */
  return 0;
}


static long int __attribute__ ((noinline))
handle_intel (int name, const struct cpu_features *cpu_features)
{
  unsigned int maxidx = cpu_features->basic.max_cpuid;

  /* Return -1 for older CPUs.  */
  if (maxidx < 2)
    return -1;

  /* OK, we can use the CPUID instruction to get all info about the
     caches.  */
  unsigned int cnt = 0;
  unsigned int max = 1;
  long int result = 0;
  bool no_level_2_or_3 = false;
  bool has_level_2 = false;

  while (cnt++ < max)
    {
      unsigned int eax;
      unsigned int ebx;
      unsigned int ecx;
      unsigned int edx;
      __cpuid (2, eax, ebx, ecx, edx);

      /* The low byte of EAX in the first round contains the number of
         rounds we have to make.  There is at least one, the one we are
         already doing.  */
      if (cnt == 1)
        {
          max = eax & 0xff;
          eax &= 0xffffff00;
        }

      /* Process the individual registers' value.  */
      result = intel_check_word (name, eax, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;

      result = intel_check_word (name, ebx, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;

      result = intel_check_word (name, ecx, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;

      result = intel_check_word (name, edx, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;
    }

  if (name >= _SC_LEVEL2_CACHE_SIZE && name <= _SC_LEVEL3_CACHE_LINESIZE
      && no_level_2_or_3)
    return -1;

  return 0;
}


static long int __attribute__ ((noinline))
handle_amd (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  __cpuid (0x80000000, eax, ebx, ecx, edx);

  /* No level 4 cache (yet).  */
  if (name > _SC_LEVEL3_CACHE_LINESIZE)
    return 0;

  unsigned int fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE);
  if (eax < fn)
    return 0;

  __cpuid (fn, eax, ebx, ecx, edx);

  if (name < _SC_LEVEL1_DCACHE_SIZE)
    {
      name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE;
      ecx = edx;
    }

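  /* The decoding below assumes the usual AMD leaf layout (my reading of
     the AMD CPUID documentation, not stated in the original code): leaf
     0x80000005 reports the L1 data cache size in KiB in ECX[31:24] (the
     instruction cache in EDX), and leaf 0x80000006 reports the L2 size
     in KiB in ECX[31:16] and the L3 size in 512 KiB units in EDX[31:18].
     The shift-and-mask expressions convert those units to bytes without
     isolating the fields first.  */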
  switch (name)
    {
    case _SC_LEVEL1_DCACHE_SIZE:
      return (ecx >> 14) & 0x3fc00;

    case _SC_LEVEL1_DCACHE_ASSOC:
      ecx >>= 16;
      if ((ecx & 0xff) == 0xff)
        /* Fully associative.  */
        return (ecx << 2) & 0x3fc00;
      return ecx & 0xff;

    case _SC_LEVEL1_DCACHE_LINESIZE:
      return ecx & 0xff;

    case _SC_LEVEL2_CACHE_SIZE:
      return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00;

    case _SC_LEVEL2_CACHE_ASSOC:
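      /* As far as I can tell from the AMD documentation, associativity
         is a 4-bit code in bits 15:12 (here for L2, likewise for L3
         below): 0 means the cache is disabled, 1, 2 and 4 are literal
         way counts, codes 6 through 14 map to 8, 16, 32, 48, 64, 96 and
         128 ways, 15 means fully associative (reconstructed below as
         size / line size), and the remaining codes are reserved.  */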
      switch ((ecx >> 12) & 0xf)
        {
        case 0:
        case 1:
        case 2:
        case 4:
          return (ecx >> 12) & 0xf;
        case 6:
          return 8;
        case 8:
          return 16;
        case 10:
          return 32;
        case 11:
          return 48;
        case 12:
          return 64;
        case 13:
          return 96;
        case 14:
          return 128;
        case 15:
          return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff);
        default:
          return 0;
        }
      /* NOTREACHED */

    case _SC_LEVEL2_CACHE_LINESIZE:
      return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff;

    case _SC_LEVEL3_CACHE_SIZE:
      return (edx & 0xf000) == 0 ? 0 : (edx & 0x3ffc0000) << 1;

    case _SC_LEVEL3_CACHE_ASSOC:
      switch ((edx >> 12) & 0xf)
        {
        case 0:
        case 1:
        case 2:
        case 4:
          return (edx >> 12) & 0xf;
        case 6:
          return 8;
        case 8:
          return 16;
        case 10:
          return 32;
        case 11:
          return 48;
        case 12:
          return 64;
        case 13:
          return 96;
        case 14:
          return 128;
        case 15:
          return ((edx & 0x3ffc0000) << 1) / (edx & 0xff);
        default:
          return 0;
        }
      /* NOTREACHED */

    case _SC_LEVEL3_CACHE_LINESIZE:
      return (edx & 0xf000) == 0 ? 0 : edx & 0xff;

    default:
      assert (! "cannot happen");
    }
  return -1;
}


static long int __attribute__ ((noinline))
handle_zhaoxin (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;

  int folded_rel_name = (M(name) / 3) * 3;

  unsigned int round = 0;
  while (1)
    {
      __cpuid_count (4, round, eax, ebx, ecx, edx);

      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
      if (type == null)
        break;

      unsigned int level = (eax >> 5) & 0x7;

      if ((level == 1 && type == data
           && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
          || (level == 1 && type == inst
              && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
          || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
          || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE)))
        {
          unsigned int offset = M(name) - folded_rel_name;

          if (offset == 0)
            /* Cache size.  */
            return (((ebx >> 22) + 1)
                    * (((ebx >> 12) & 0x3ff) + 1)
                    * ((ebx & 0xfff) + 1)
                    * (ecx + 1));
          if (offset == 1)
            return (ebx >> 22) + 1;

          assert (offset == 2);
          return (ebx & 0xfff) + 1;
        }

      ++round;
    }

  /* Nothing found.  */
  return 0;
}

static void
get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
                       long int core)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;

  /* Number of logical processors sharing L2 cache.  */
  int threads_l2;

  /* Number of logical processors sharing L3 cache.  */
  int threads_l3;

  const struct cpu_features *cpu_features = __get_cpu_features ();
  int max_cpuid = cpu_features->basic.max_cpuid;
  unsigned int family = cpu_features->basic.family;
  unsigned int model = cpu_features->basic.model;
  long int shared = *shared_ptr;
  unsigned int threads = *threads_ptr;
  bool inclusive_cache = true;
  bool support_count_mask = true;

  /* Try L3 first.  */
  unsigned int level = 3;

  if (cpu_features->basic.kind == arch_kind_zhaoxin && family == 6)
    support_count_mask = false;

  if (shared <= 0)
    {
      /* Try L2 otherwise.  */
      level = 2;
      shared = core;
      threads_l2 = 0;
      threads_l3 = -1;
    }
  else
    {
      threads_l2 = 0;
      threads_l3 = 0;
    }

  /* A value of 0 for the HTT bit indicates there is only a single
     logical processor.  */
  if (HAS_CPU_FEATURE (HTT))
    {
      /* Figure out the number of logical threads that share the
         highest cache level.  */
      if (max_cpuid >= 4)
        {
          int i = 0;

          /* Query until cache level 2 and 3 are enumerated.  */
          int check = 0x1 | (threads_l3 == 0) << 1;
          do
            {
              __cpuid_count (4, i++, eax, ebx, ecx, edx);

              /* There seems to be a bug in at least some Pentium Ds
                 which sometimes fail to iterate all cache parameters.
                 Do not loop indefinitely here, stop in this case and
                 assume there is no such information.  */
              if (cpu_features->basic.kind == arch_kind_intel
                  && (eax & 0x1f) == 0)
                goto intel_bug_no_cache_info;

              switch ((eax >> 5) & 0x7)
                {
                default:
                  break;
                case 2:
                  if ((check & 0x1))
                    {
                      /* Get maximum number of logical processors
                         sharing L2 cache.  */
                      threads_l2 = (eax >> 14) & 0x3ff;
                      check &= ~0x1;
                    }
                  break;
                case 3:
                  if ((check & (0x1 << 1)))
                    {
                      /* Get maximum number of logical processors
                         sharing L3 cache.  */
                      threads_l3 = (eax >> 14) & 0x3ff;

                      /* Check if L2 and L3 caches are inclusive.  */
                      inclusive_cache = (edx & 0x2) != 0;
                      check &= ~(0x1 << 1);
                    }
                  break;
                }
            }
          while (check);

          /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum
             numbers of addressable IDs for logical processors sharing
             the cache, instead of the maximum number of threads
             sharing the cache.  */
          if (max_cpuid >= 11 && support_count_mask)
            {
              /* Find the number of logical processors shipped in
                 one core and apply count mask.  */
              i = 0;

              /* Count SMT only if there is L3 cache.  Always count
                 core if there is no L3 cache.  */
              int count = ((threads_l2 > 0 && level == 3)
                           | ((threads_l3 > 0
                               || (threads_l2 > 0 && level == 2)) << 1));

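              /* COUNT acts as a pair of request bits: bit 0 asks for the
                 SMT-level thread count and bit 1 for the core-level
                 count, matching the level types (0x100 and 0x200) that
                 CPUID leaf 11 reports in ECX in the loop below.  */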
              while (count)
                {
                  __cpuid_count (11, i++, eax, ebx, ecx, edx);

                  int shipped = ebx & 0xff;
                  int type = ecx & 0xff00;
                  if (shipped == 0 || type == 0)
                    break;
                  else if (type == 0x100)
                    {
                      /* Count SMT.  */
                      if ((count & 0x1))
                        {
                          int count_mask;

                          /* Compute count mask.  */
                          asm ("bsr %1, %0"
                               : "=r" (count_mask) : "g" (threads_l2));
                          count_mask = ~(-1 << (count_mask + 1));
                          threads_l2 = (shipped - 1) & count_mask;
                          count &= ~0x1;
                        }
                    }
                  else if (type == 0x200)
                    {
                      /* Count core.  */
                      if ((count & (0x1 << 1)))
                        {
                          int count_mask;
                          int threads_core
                            = (level == 2 ? threads_l2 : threads_l3);

                          /* Compute count mask.  */
                          asm ("bsr %1, %0"
                               : "=r" (count_mask) : "g" (threads_core));
                          count_mask = ~(-1 << (count_mask + 1));
                          threads_core = (shipped - 1) & count_mask;
                          if (level == 2)
                            threads_l2 = threads_core;
                          else
                            threads_l3 = threads_core;
                          count &= ~(0x1 << 1);
                        }
                    }
                }
            }
          if (threads_l2 > 0)
            threads_l2 += 1;
          if (threads_l3 > 0)
            threads_l3 += 1;
          if (level == 2)
            {
              if (threads_l2)
                {
                  threads = threads_l2;
                  if (cpu_features->basic.kind == arch_kind_intel
                      && threads > 2
                      && family == 6)
                    switch (model)
                      {
                      case 0x37:
                      case 0x4a:
                      case 0x4d:
                      case 0x5a:
                      case 0x5d:
                        /* Silvermont has L2 cache shared by 2 cores.  */
                        threads = 2;
                        break;
                      default:
                        break;
                      }
                }
            }
          else if (threads_l3)
            threads = threads_l3;
        }
      else
        {
        intel_bug_no_cache_info:
          /* Assume that all logical threads share the highest cache
             level.  */
          threads
            = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> 16)
               & 0xff);
        }

      /* Cap usage of highest cache level to the number of supported
         threads.  */
      if (shared > 0 && threads > 0)
        shared /= threads;
    }

  /* Account for non-inclusive L2 and L3 caches.  */
  if (!inclusive_cache)
    {
      if (threads_l2 > 0)
        core /= threads_l2;
      shared += core;
    }

  *shared_ptr = shared;
  *threads_ptr = threads;
}

static void
dl_init_cacheinfo (struct cpu_features *cpu_features)
{
  /* Find out what brand of processor.  */
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  int max_cpuid_ex;
  long int data = -1;
  long int shared = -1;
  long int core = -1;
  unsigned int threads = 0;
  unsigned long int level1_icache_size = -1;
  unsigned long int level1_icache_linesize = -1;
  unsigned long int level1_dcache_size = -1;
  unsigned long int level1_dcache_assoc = -1;
  unsigned long int level1_dcache_linesize = -1;
  unsigned long int level2_cache_size = -1;
  unsigned long int level2_cache_assoc = -1;
  unsigned long int level2_cache_linesize = -1;
  unsigned long int level3_cache_size = -1;
  unsigned long int level3_cache_assoc = -1;
  unsigned long int level3_cache_linesize = -1;
  unsigned long int level4_cache_size = -1;

  if (cpu_features->basic.kind == arch_kind_intel)
    {
      data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
      core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
      shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);

      level1_icache_size
        = handle_intel (_SC_LEVEL1_ICACHE_SIZE, cpu_features);
      level1_icache_linesize
        = handle_intel (_SC_LEVEL1_ICACHE_LINESIZE, cpu_features);
      level1_dcache_size = data;
      level1_dcache_assoc
        = handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
      level1_dcache_linesize
        = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
      level2_cache_size = core;
      level2_cache_assoc
        = handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
      level2_cache_linesize
        = handle_intel (_SC_LEVEL2_CACHE_LINESIZE, cpu_features);
      level3_cache_size = shared;
      level3_cache_assoc
        = handle_intel (_SC_LEVEL3_CACHE_ASSOC, cpu_features);
      level3_cache_linesize
        = handle_intel (_SC_LEVEL3_CACHE_LINESIZE, cpu_features);
      level4_cache_size
        = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);

      get_common_cache_info (&shared, &threads, core);
    }
  else if (cpu_features->basic.kind == arch_kind_zhaoxin)
    {
      data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
      core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
      shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);

      level1_icache_size = handle_zhaoxin (_SC_LEVEL1_ICACHE_SIZE);
      level1_icache_linesize = handle_zhaoxin (_SC_LEVEL1_ICACHE_LINESIZE);
      level1_dcache_size = data;
      level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
      level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
      level2_cache_size = core;
      level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
      level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
      level3_cache_size = shared;
      level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
      level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);

      get_common_cache_info (&shared, &threads, core);
    }
  else if (cpu_features->basic.kind == arch_kind_amd)
    {
      data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
      core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
      shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);

      level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
      level1_icache_linesize = handle_amd (_SC_LEVEL1_ICACHE_LINESIZE);
      level1_dcache_size = data;
      level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
      level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
      level2_cache_size = core;
      level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
      level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
      level3_cache_size = shared;
      level3_cache_assoc = handle_amd (_SC_LEVEL3_CACHE_ASSOC);
      level3_cache_linesize = handle_amd (_SC_LEVEL3_CACHE_LINESIZE);

      /* Get maximum extended function.  */
      __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx);

      if (shared <= 0)
        /* No shared L3 cache.  All we have is the L2 cache.  */
        shared = core;
      else
        {
          /* Figure out the number of logical threads that share L3.  */
          if (max_cpuid_ex >= 0x80000008)
            {
              /* Get width of APIC ID.  */
              __cpuid (0x80000008, max_cpuid_ex, ebx, ecx, edx);
              threads = 1 << ((ecx >> 12) & 0x0f);
            }

          if (threads == 0 || cpu_features->basic.family >= 0x17)
            {
              /* If APIC ID width is not available, use logical
                 processor count.  */
              __cpuid (0x00000001, max_cpuid_ex, ebx, ecx, edx);

              if ((edx & (1 << 28)) != 0)
                threads = (ebx >> 16) & 0xff;
            }

          /* Cap usage of highest cache level to the number of
             supported threads.  */
          if (threads > 0)
            shared /= threads;

          /* Get shared cache per ccx for Zen architectures.  */
          if (cpu_features->basic.family >= 0x17)
            {
              unsigned int eax;

              /* Get the number of threads that share the L3 cache in
                 a CCX.  */
              __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx);

              unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1;
              shared *= threads_per_ccx;
            }
          else
            {
              /* Account for exclusive L2 and L3 caches.  */
              shared += core;
            }
        }
    }

  cpu_features->level1_icache_size = level1_icache_size;
  cpu_features->level1_icache_linesize = level1_icache_linesize;
  cpu_features->level1_dcache_size = level1_dcache_size;
  cpu_features->level1_dcache_assoc = level1_dcache_assoc;
  cpu_features->level1_dcache_linesize = level1_dcache_linesize;
  cpu_features->level2_cache_size = level2_cache_size;
  cpu_features->level2_cache_assoc = level2_cache_assoc;
  cpu_features->level2_cache_linesize = level2_cache_linesize;
  cpu_features->level3_cache_size = level3_cache_size;
  cpu_features->level3_cache_assoc = level3_cache_assoc;
  cpu_features->level3_cache_linesize = level3_cache_linesize;
  cpu_features->level4_cache_size = level4_cache_size;

  /* The default setting for the non_temporal threshold is 3/4 of one
     thread's share of the chip's cache.  For most Intel and AMD processors
     with an initial release date between 2017 and 2020, a thread's typical
     share of the cache is from 500 KBytes to 2 MBytes.  Using the 3/4
     threshold leaves 125 KBytes to 500 KBytes of the thread's data
     in cache after a maximum temporal copy, which will maintain
     in cache a reasonable portion of the thread's stack and other
     active data.  If the threshold is set higher than one thread's
     share of the cache, it has a substantial risk of negatively
     impacting the performance of other threads running on the chip.  */
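  /* As a purely illustrative example: a 2 MiB per-thread share of the
     cache yields a default threshold of 1.5 MiB, leaving roughly 512 KiB
     of the thread's data resident after a maximal temporal copy.  */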
  unsigned long int non_temporal_threshold = shared * 3 / 4;
  /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of
     'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best
     if that operation cannot overflow.  Minimum of 0x4040 (16448) because the
     L(large_memset_4x) loops need 64 bytes to cache-align and enough space
     for at least one iteration of the 4x PAGE_SIZE unrolled loop.  Both
     values are reflected in the manual.  */
  unsigned long int maximum_non_temporal_threshold = SIZE_MAX >> 4;
  unsigned long int minimum_non_temporal_threshold = 0x4040;
  if (non_temporal_threshold < minimum_non_temporal_threshold)
    non_temporal_threshold = minimum_non_temporal_threshold;
  else if (non_temporal_threshold > maximum_non_temporal_threshold)
    non_temporal_threshold = maximum_non_temporal_threshold;

#if HAVE_TUNABLES
  /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
  unsigned int minimum_rep_movsb_threshold;
#endif
  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
     threshold is 2048 * (VEC_SIZE / 16).  */
  unsigned int rep_movsb_threshold;
  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
      && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
    {
      rep_movsb_threshold = 4096 * (64 / 16);
#if HAVE_TUNABLES
      minimum_rep_movsb_threshold = 64 * 8;
#endif
    }
  else if (CPU_FEATURE_PREFERRED_P (cpu_features,
                                    AVX_Fast_Unaligned_Load))
    {
      rep_movsb_threshold = 4096 * (32 / 16);
#if HAVE_TUNABLES
      minimum_rep_movsb_threshold = 32 * 8;
#endif
    }
  else
    {
      rep_movsb_threshold = 2048 * (16 / 16);
#if HAVE_TUNABLES
      minimum_rep_movsb_threshold = 16 * 8;
#endif
    }
  /* NB: The default REP MOVSB threshold is 2112 on processors with fast
     short REP MOVSB (FSRM).  */
  if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
    rep_movsb_threshold = 2112;

  /* The default threshold to use Enhanced REP STOSB.  */
  unsigned long int rep_stosb_threshold = 2048;

#if HAVE_TUNABLES
  long int tunable_size;

  tunable_size = TUNABLE_GET (x86_data_cache_size, long int, NULL);
  /* NB: Ignore the default value 0.  */
  if (tunable_size != 0)
    data = tunable_size;

  tunable_size = TUNABLE_GET (x86_shared_cache_size, long int, NULL);
  /* NB: Ignore the default value 0.  */
  if (tunable_size != 0)
    shared = tunable_size;

  tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
  if (tunable_size > minimum_non_temporal_threshold
      && tunable_size <= maximum_non_temporal_threshold)
    non_temporal_threshold = tunable_size;

  tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
  if (tunable_size > minimum_rep_movsb_threshold)
    rep_movsb_threshold = tunable_size;

  /* NB: The default value of the x86_rep_stosb_threshold tunable is the
     same as the default value of __x86_rep_stosb_threshold and the
     minimum value is fixed.  */
  rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
                                     long int, NULL);

  TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
  TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
  TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
                           minimum_non_temporal_threshold,
                           maximum_non_temporal_threshold);
  TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
                           minimum_rep_movsb_threshold, SIZE_MAX);
  TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
                           SIZE_MAX);
#endif

  unsigned long int rep_movsb_stop_threshold;
  /* The ERMS feature is implemented starting with the AMD Zen3
     architecture, and it performs poorly for data above the L2 cache
     size.  Hence, add an upper bound threshold to limit the use of
     Enhanced REP MOVSB operations and set it to the L2 cache size.  */
  if (cpu_features->basic.kind == arch_kind_amd)
    rep_movsb_stop_threshold = core;
  /* For architectures other than AMD, set the upper bound of ERMS to
     the computed non-temporal threshold.  */
  else
    rep_movsb_stop_threshold = non_temporal_threshold;

  cpu_features->data_cache_size = data;
  cpu_features->shared_cache_size = shared;
  cpu_features->non_temporal_threshold = non_temporal_threshold;
  cpu_features->rep_movsb_threshold = rep_movsb_threshold;
  cpu_features->rep_stosb_threshold = rep_stosb_threshold;
  cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
}