/* Initialize x86 cache info.
   Copyright (C) 2020-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
/* Intel CPUID leaf 2 reports caches as one-byte descriptors.  This
   table maps each known descriptor byte to the cache it denotes.
   IDX is the descriptor byte; the table MUST remain sorted by IDX in
   ascending order because it is searched with bsearch.  ASSOC is the
   associativity, LINESIZE the line size in bytes, REL_NAME the _SC_*
   name of the cache expressed relative to _SC_LEVEL1_ICACHE_SIZE (so
   it fits in an unsigned char), and SIZE the total cache size in
   bytes.  */
static const struct intel_02_cache_info
{
  unsigned char idx;
  unsigned char assoc;
  unsigned char linesize;
  unsigned char rel_name;
  unsigned int size;
} intel_02_known [] =
  {
    /* Rebase the _SC_* constants so they fit in REL_NAME; the _SC_
       cache constants are consecutive, ordered SIZE, ASSOC, LINESIZE
       per level.  */
#define M(sc) ((sc) - _SC_LEVEL1_ICACHE_SIZE)
    { 0x06,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    8192 },
    { 0x08,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),   16384 },
    { 0x09,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),   32768 },
    { 0x0a,  2, 32, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
    { 0x0c,  4, 32, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x0d,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x0e,  6, 64, M(_SC_LEVEL1_DCACHE_SIZE),   24576 },
    { 0x21,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x22,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
    { 0x23,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0x25,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0x29,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0x2c,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),   32768 },
    { 0x30,  8, 64, M(_SC_LEVEL1_ICACHE_SIZE),   32768 },
    { 0x39,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x3a,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),   196608 },
    { 0x3b,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x3c,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x3d,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),   393216 },
    { 0x3e,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x3f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x41,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x42,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x43,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x44,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x45,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x46,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0x47,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0x48, 12, 64, M(_SC_LEVEL2_CACHE_SIZE),  3145728 },
    { 0x49, 16, 64, M(_SC_LEVEL2_CACHE_SIZE),  4194304 },
    { 0x4a, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  6291456 },
    { 0x4b, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0x4c, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
    { 0x4d, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 16777216 },
    { 0x4e, 24, 64, M(_SC_LEVEL2_CACHE_SIZE),  6291456 },
    { 0x60,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x66,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
    { 0x67,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x68,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   32768 },
    { 0x78,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x79,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x7a,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x7b,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x7c,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x7d,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x7f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x80,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x82,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x83,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x84,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x85,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x86,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x87,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0xd0,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
    { 0xd1,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0xd2,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xd6,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0xd7,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xd8,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xdc, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xdd, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xde, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0xe2, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
    { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 18874368 },
    { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 25165824 },
  };

/* Number of entries in the descriptor table above.  */
#define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0]))
102 intel_02_known_compare (const void *p1
, const void *p2
)
104 const struct intel_02_cache_info
*i1
;
105 const struct intel_02_cache_info
*i2
;
107 i1
= (const struct intel_02_cache_info
*) p1
;
108 i2
= (const struct intel_02_cache_info
*) p2
;
110 if (i1
->idx
== i2
->idx
)
113 return i1
->idx
< i2
->idx
? -1 : 1;
118 __attribute__ ((noinline
))
119 intel_check_word (int name
, unsigned int value
, bool *has_level_2
,
120 bool *no_level_2_or_3
,
121 const struct cpu_features
*cpu_features
)
123 if ((value
& 0x80000000) != 0)
124 /* The register value is reserved. */
127 /* Fold the name. The _SC_ constants are always in the order SIZE,
129 int folded_rel_name
= (M(name
) / 3) * 3;
133 unsigned int byte
= value
& 0xff;
137 *no_level_2_or_3
= true;
139 if (folded_rel_name
== M(_SC_LEVEL3_CACHE_SIZE
))
140 /* No need to look further. */
143 else if (byte
== 0xff)
145 /* CPUID leaf 0x4 contains all the information. We need to
152 unsigned int round
= 0;
155 __cpuid_count (4, round
, eax
, ebx
, ecx
, edx
);
157 enum { null
= 0, data
= 1, inst
= 2, uni
= 3 } type
= eax
& 0x1f;
159 /* That was the end. */
162 unsigned int level
= (eax
>> 5) & 0x7;
164 if ((level
== 1 && type
== data
165 && folded_rel_name
== M(_SC_LEVEL1_DCACHE_SIZE
))
166 || (level
== 1 && type
== inst
167 && folded_rel_name
== M(_SC_LEVEL1_ICACHE_SIZE
))
168 || (level
== 2 && folded_rel_name
== M(_SC_LEVEL2_CACHE_SIZE
))
169 || (level
== 3 && folded_rel_name
== M(_SC_LEVEL3_CACHE_SIZE
))
170 || (level
== 4 && folded_rel_name
== M(_SC_LEVEL4_CACHE_SIZE
)))
172 unsigned int offset
= M(name
) - folded_rel_name
;
176 return (((ebx
>> 22) + 1)
177 * (((ebx
>> 12) & 0x3ff) + 1)
178 * ((ebx
& 0xfff) + 1)
181 return (ebx
>> 22) + 1;
183 assert (offset
== 2);
184 return (ebx
& 0xfff) + 1;
189 /* There is no other cache information anywhere else. */
194 if (byte
== 0x49 && folded_rel_name
== M(_SC_LEVEL3_CACHE_SIZE
))
196 /* Intel reused this value. For family 15, model 6 it
197 specifies the 3rd level cache. Otherwise the 2nd
199 unsigned int family
= cpu_features
->basic
.family
;
200 unsigned int model
= cpu_features
->basic
.model
;
202 if (family
== 15 && model
== 6)
204 /* The level 3 cache is encoded for this model like
205 the level 2 cache is for other models. Pretend
206 the caller asked for the level 2 cache. */
207 name
= (_SC_LEVEL2_CACHE_SIZE
208 + (name
- _SC_LEVEL3_CACHE_SIZE
));
209 folded_rel_name
= M(_SC_LEVEL2_CACHE_SIZE
);
213 struct intel_02_cache_info
*found
;
214 struct intel_02_cache_info search
;
217 found
= bsearch (&search
, intel_02_known
, nintel_02_known
,
218 sizeof (intel_02_known
[0]), intel_02_known_compare
);
221 if (found
->rel_name
== folded_rel_name
)
223 unsigned int offset
= M(name
) - folded_rel_name
;
231 assert (offset
== 2);
232 return found
->linesize
;
235 if (found
->rel_name
== M(_SC_LEVEL2_CACHE_SIZE
))
240 /* Next byte for the next round. */
249 static long int __attribute__ ((noinline
))
250 handle_intel (int name
, const struct cpu_features
*cpu_features
)
252 unsigned int maxidx
= cpu_features
->basic
.max_cpuid
;
254 /* Return -1 for older CPUs. */
258 /* OK, we can use the CPUID instruction to get all info about the
260 unsigned int cnt
= 0;
261 unsigned int max
= 1;
263 bool no_level_2_or_3
= false;
264 bool has_level_2
= false;
272 __cpuid (2, eax
, ebx
, ecx
, edx
);
274 /* The low byte of EAX in the first round contain the number of
275 rounds we have to make. At least one, the one we are already
283 /* Process the individual registers' value. */
284 result
= intel_check_word (name
, eax
, &has_level_2
,
285 &no_level_2_or_3
, cpu_features
);
289 result
= intel_check_word (name
, ebx
, &has_level_2
,
290 &no_level_2_or_3
, cpu_features
);
294 result
= intel_check_word (name
, ecx
, &has_level_2
,
295 &no_level_2_or_3
, cpu_features
);
299 result
= intel_check_word (name
, edx
, &has_level_2
,
300 &no_level_2_or_3
, cpu_features
);
305 if (name
>= _SC_LEVEL2_CACHE_SIZE
&& name
<= _SC_LEVEL3_CACHE_LINESIZE
/* Return the value of cache parameter NAME on an AMD CPU, decoded
   from CPUID leaves 0x80000005 (L1) and 0x80000006 (L2/L3).  Returns
   0 when the leaf is unsupported or the cache is absent.  L1 icache
   requests are redirected to the dcache decoding with the icache
   register (EDX of leaf 0x80000005) substituted, since both use the
   same bit layout.  */
static long int __attribute__ ((noinline))
handle_amd (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  __cpuid (0x80000000, eax, ebx, ecx, edx);

  /* No level 4 cache (yet).  */
  if (name > _SC_LEVEL3_CACHE_LINESIZE)
    return 0;

  unsigned int fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE);
  if (eax < fn)
    /* The needed leaf is not supported.  */
    return 0;

  __cpuid (fn, eax, ebx, ecx, edx);

  if (name < _SC_LEVEL1_DCACHE_SIZE)
    {
      /* L1 icache shares the encoding of the dcache; use the icache
	 register instead.  */
      name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE;
      ecx = edx;
    }

  switch (name)
    {
    case _SC_LEVEL1_DCACHE_SIZE:
      /* Size in KiB in bits 31-24; convert to bytes.  */
      return (ecx >> 14) & 0x3fc00;

    case _SC_LEVEL1_DCACHE_ASSOC:
      ecx >>= 16;
      if ((ecx & 0xff) == 0xff)
	/* Fully associative.  */
	return (ecx << 2) & 0x3fc00;
      return ecx & 0xff;

    case _SC_LEVEL1_DCACHE_LINESIZE:
      return ecx & 0xff;

    case _SC_LEVEL2_CACHE_SIZE:
      /* Associativity field 0 means the L2 cache is absent.  */
      return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00;

    case _SC_LEVEL2_CACHE_ASSOC:
      switch ((ecx >> 12) & 0xf)
	{
	case 0:
	case 1:
	case 2:
	case 4:
	  return (ecx >> 12) & 0xf;
	case 6:
	  return 8;
	case 8:
	  return 16;
	case 10:
	  return 32;
	case 11:
	  return 48;
	case 12:
	  return 64;
	case 13:
	  return 96;
	case 14:
	  return 128;
	case 15:
	  /* Fully associative: ways == lines == size / linesize.  */
	  return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff);
	default:
	  return 0;
	}
      /* NOTREACHED */

    case _SC_LEVEL2_CACHE_LINESIZE:
      return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff;

    case _SC_LEVEL3_CACHE_SIZE:
      /* L3 size is reported in 512 KiB units.  */
      return (edx & 0xf000) == 0 ? 0 : (edx & 0x3ffc0000) << 1;

    case _SC_LEVEL3_CACHE_ASSOC:
      switch ((edx >> 12) & 0xf)
	{
	case 0:
	case 1:
	case 2:
	case 4:
	  return (edx >> 12) & 0xf;
	case 6:
	  return 8;
	case 8:
	  return 16;
	case 10:
	  return 32;
	case 11:
	  return 48;
	case 12:
	  return 64;
	case 13:
	  return 96;
	case 14:
	  return 128;
	case 15:
	  /* Fully associative.  */
	  return ((edx & 0x3ffc0000) << 1) / (edx & 0xff);
	default:
	  return 0;
	}
      /* NOTREACHED */

    case _SC_LEVEL3_CACHE_LINESIZE:
      return (edx & 0xf000) == 0 ? 0 : edx & 0xff;

    default:
      assert (! "cannot happen");
    }
  return -1;
}
/* Return the value of cache parameter NAME on a Zhaoxin CPU by
   enumerating CPUID leaf 4 (deterministic cache parameters), which
   these CPUs support directly.  Returns 0 when no matching cache
   level is enumerated.  */
static long int __attribute__ ((noinline))
handle_zhaoxin (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;

  /* Fold NAME to the SIZE member of its level's triple; the _SC_
     constants are ordered SIZE, ASSOC, LINESIZE.  */
  int folded_rel_name = (M(name) / 3) * 3;

  unsigned int round = 0;
  while (1)
    {
      __cpuid_count (4, round, eax, ebx, ecx, edx);

      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
      if (type == null)
	/* That was the end.  */
	break;

      unsigned int level = (eax >> 5) & 0x7;

      if ((level == 1 && type == data
	   && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
	  || (level == 1 && type == inst
	      && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
	  || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
	  || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE)))
	{
	  /* OFFSET selects SIZE (0), ASSOC (1) or LINESIZE (2).  */
	  unsigned int offset = M(name) - folded_rel_name;

	  if (offset == 0)
	    /* Cache size: ways * partitions * line size * sets.  */
	    return (((ebx >> 22) + 1)
		    * (((ebx >> 12) & 0x3ff) + 1)
		    * ((ebx & 0xfff) + 1)
		    * (ecx + 1));
	  if (offset == 1)
	    return (ebx >> 22) + 1;

	  assert (offset == 2);
	  return (ebx & 0xfff) + 1;
	}

      ++round;
    }

  /* Nothing found.  */
  return 0;
}
481 get_common_cache_info (long int *shared_ptr
, unsigned int *threads_ptr
,
489 /* Number of logical processors sharing L2 cache. */
492 /* Number of logical processors sharing L3 cache. */
495 const struct cpu_features
*cpu_features
= __get_cpu_features ();
496 int max_cpuid
= cpu_features
->basic
.max_cpuid
;
497 unsigned int family
= cpu_features
->basic
.family
;
498 unsigned int model
= cpu_features
->basic
.model
;
499 long int shared
= *shared_ptr
;
500 unsigned int threads
= *threads_ptr
;
501 bool inclusive_cache
= true;
502 bool support_count_mask
= true;
505 unsigned int level
= 3;
507 if (cpu_features
->basic
.kind
== arch_kind_zhaoxin
&& family
== 6)
508 support_count_mask
= false;
512 /* Try L2 otherwise. */
524 /* A value of 0 for the HTT bit indicates there is only a single
525 logical processor. */
526 if (HAS_CPU_FEATURE (HTT
))
528 /* Figure out the number of logical threads that share the
529 highest cache level. */
534 /* Query until cache level 2 and 3 are enumerated. */
535 int check
= 0x1 | (threads_l3
== 0) << 1;
538 __cpuid_count (4, i
++, eax
, ebx
, ecx
, edx
);
540 /* There seems to be a bug in at least some Pentium Ds
541 which sometimes fail to iterate all cache parameters.
542 Do not loop indefinitely here, stop in this case and
543 assume there is no such information. */
544 if (cpu_features
->basic
.kind
== arch_kind_intel
545 && (eax
& 0x1f) == 0 )
546 goto intel_bug_no_cache_info
;
548 switch ((eax
>> 5) & 0x7)
555 /* Get maximum number of logical processors
557 threads_l2
= (eax
>> 14) & 0x3ff;
562 if ((check
& (0x1 << 1)))
564 /* Get maximum number of logical processors
566 threads_l3
= (eax
>> 14) & 0x3ff;
568 /* Check if L2 and L3 caches are inclusive. */
569 inclusive_cache
= (edx
& 0x2) != 0;
570 check
&= ~(0x1 << 1);
577 /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum
578 numbers of addressable IDs for logical processors sharing
579 the cache, instead of the maximum number of threads
580 sharing the cache. */
581 if (max_cpuid
>= 11 && support_count_mask
)
583 /* Find the number of logical processors shipped in
584 one core and apply count mask. */
587 /* Count SMT only if there is L3 cache. Always count
588 core if there is no L3 cache. */
589 int count
= ((threads_l2
> 0 && level
== 3)
591 || (threads_l2
> 0 && level
== 2)) << 1));
595 __cpuid_count (11, i
++, eax
, ebx
, ecx
, edx
);
597 int shipped
= ebx
& 0xff;
598 int type
= ecx
& 0xff00;
599 if (shipped
== 0 || type
== 0)
601 else if (type
== 0x100)
608 /* Compute count mask. */
610 : "=r" (count_mask
) : "g" (threads_l2
));
611 count_mask
= ~(-1 << (count_mask
+ 1));
612 threads_l2
= (shipped
- 1) & count_mask
;
616 else if (type
== 0x200)
619 if ((count
& (0x1 << 1)))
623 = (level
== 2 ? threads_l2
: threads_l3
);
625 /* Compute count mask. */
627 : "=r" (count_mask
) : "g" (threads_core
));
628 count_mask
= ~(-1 << (count_mask
+ 1));
629 threads_core
= (shipped
- 1) & count_mask
;
631 threads_l2
= threads_core
;
633 threads_l3
= threads_core
;
634 count
&= ~(0x1 << 1);
647 threads
= threads_l2
;
648 if (cpu_features
->basic
.kind
== arch_kind_intel
658 /* Silvermont has L2 cache shared by 2 cores. */
667 threads
= threads_l3
;
671 intel_bug_no_cache_info
:
672 /* Assume that all logical threads share the highest cache
675 = ((cpu_features
->features
[CPUID_INDEX_1
].cpuid
.ebx
>> 16)
679 /* Cap usage of highest cache level to the number of supported
681 if (shared
> 0 && threads
> 0)
685 /* Account for non-inclusive L2 and L3 caches. */
686 if (!inclusive_cache
)
693 *shared_ptr
= shared
;
694 *threads_ptr
= threads
;
698 dl_init_cacheinfo (struct cpu_features
*cpu_features
)
700 /* Find out what brand of processor. */
706 long int shared
= -1;
708 unsigned int threads
= 0;
709 unsigned long int level1_icache_size
= -1;
710 unsigned long int level1_icache_linesize
= -1;
711 unsigned long int level1_dcache_size
= -1;
712 unsigned long int level1_dcache_assoc
= -1;
713 unsigned long int level1_dcache_linesize
= -1;
714 unsigned long int level2_cache_size
= -1;
715 unsigned long int level2_cache_assoc
= -1;
716 unsigned long int level2_cache_linesize
= -1;
717 unsigned long int level3_cache_size
= -1;
718 unsigned long int level3_cache_assoc
= -1;
719 unsigned long int level3_cache_linesize
= -1;
720 unsigned long int level4_cache_size
= -1;
722 if (cpu_features
->basic
.kind
== arch_kind_intel
)
724 data
= handle_intel (_SC_LEVEL1_DCACHE_SIZE
, cpu_features
);
725 core
= handle_intel (_SC_LEVEL2_CACHE_SIZE
, cpu_features
);
726 shared
= handle_intel (_SC_LEVEL3_CACHE_SIZE
, cpu_features
);
729 = handle_intel (_SC_LEVEL1_ICACHE_SIZE
, cpu_features
);
730 level1_icache_linesize
731 = handle_intel (_SC_LEVEL1_ICACHE_LINESIZE
, cpu_features
);
732 level1_dcache_size
= data
;
734 = handle_intel (_SC_LEVEL1_DCACHE_ASSOC
, cpu_features
);
735 level1_dcache_linesize
736 = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE
, cpu_features
);
737 level2_cache_size
= core
;
739 = handle_intel (_SC_LEVEL2_CACHE_ASSOC
, cpu_features
);
740 level2_cache_linesize
741 = handle_intel (_SC_LEVEL2_CACHE_LINESIZE
, cpu_features
);
742 level3_cache_size
= shared
;
744 = handle_intel (_SC_LEVEL3_CACHE_ASSOC
, cpu_features
);
745 level3_cache_linesize
746 = handle_intel (_SC_LEVEL3_CACHE_LINESIZE
, cpu_features
);
748 = handle_intel (_SC_LEVEL4_CACHE_SIZE
, cpu_features
);
750 get_common_cache_info (&shared
, &threads
, core
);
752 else if (cpu_features
->basic
.kind
== arch_kind_zhaoxin
)
754 data
= handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE
);
755 core
= handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE
);
756 shared
= handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE
);
758 level1_icache_size
= handle_zhaoxin (_SC_LEVEL1_ICACHE_SIZE
);
759 level1_icache_linesize
= handle_zhaoxin (_SC_LEVEL1_ICACHE_LINESIZE
);
760 level1_dcache_size
= data
;
761 level1_dcache_assoc
= handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC
);
762 level1_dcache_linesize
= handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE
);
763 level2_cache_size
= core
;
764 level2_cache_assoc
= handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC
);
765 level2_cache_linesize
= handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE
);
766 level3_cache_size
= shared
;
767 level3_cache_assoc
= handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC
);
768 level3_cache_linesize
= handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE
);
770 get_common_cache_info (&shared
, &threads
, core
);
772 else if (cpu_features
->basic
.kind
== arch_kind_amd
)
774 data
= handle_amd (_SC_LEVEL1_DCACHE_SIZE
);
775 core
= handle_amd (_SC_LEVEL2_CACHE_SIZE
);
776 shared
= handle_amd (_SC_LEVEL3_CACHE_SIZE
);
778 level1_icache_size
= handle_amd (_SC_LEVEL1_ICACHE_SIZE
);
779 level1_icache_linesize
= handle_amd (_SC_LEVEL1_ICACHE_LINESIZE
);
780 level1_dcache_size
= data
;
781 level1_dcache_assoc
= handle_amd (_SC_LEVEL1_DCACHE_ASSOC
);
782 level1_dcache_linesize
= handle_amd (_SC_LEVEL1_DCACHE_LINESIZE
);
783 level2_cache_size
= core
;
784 level2_cache_assoc
= handle_amd (_SC_LEVEL2_CACHE_ASSOC
);
785 level2_cache_linesize
= handle_amd (_SC_LEVEL2_CACHE_LINESIZE
);
786 level3_cache_size
= shared
;
787 level3_cache_assoc
= handle_amd (_SC_LEVEL3_CACHE_ASSOC
);
788 level3_cache_linesize
= handle_amd (_SC_LEVEL3_CACHE_LINESIZE
);
790 /* Get maximum extended function. */
791 __cpuid (0x80000000, max_cpuid_ex
, ebx
, ecx
, edx
);
794 /* No shared L3 cache. All we have is the L2 cache. */
798 /* Figure out the number of logical threads that share L3. */
799 if (max_cpuid_ex
>= 0x80000008)
801 /* Get width of APIC ID. */
802 __cpuid (0x80000008, max_cpuid_ex
, ebx
, ecx
, edx
);
803 threads
= 1 << ((ecx
>> 12) & 0x0f);
806 if (threads
== 0 || cpu_features
->basic
.family
>= 0x17)
808 /* If APIC ID width is not available, use logical
810 __cpuid (0x00000001, max_cpuid_ex
, ebx
, ecx
, edx
);
812 if ((edx
& (1 << 28)) != 0)
813 threads
= (ebx
>> 16) & 0xff;
816 /* Cap usage of highest cache level to the number of
817 supported threads. */
821 /* Get shared cache per ccx for Zen architectures. */
822 if (cpu_features
->basic
.family
>= 0x17)
826 /* Get number of threads share the L3 cache in CCX. */
827 __cpuid_count (0x8000001D, 0x3, eax
, ebx
, ecx
, edx
);
829 unsigned int threads_per_ccx
= ((eax
>> 14) & 0xfff) + 1;
830 shared
*= threads_per_ccx
;
834 /* Account for exclusive L2 and L3 caches. */
840 cpu_features
->level1_icache_size
= level1_icache_size
;
841 cpu_features
->level1_icache_linesize
= level1_icache_linesize
;
842 cpu_features
->level1_dcache_size
= level1_dcache_size
;
843 cpu_features
->level1_dcache_assoc
= level1_dcache_assoc
;
844 cpu_features
->level1_dcache_linesize
= level1_dcache_linesize
;
845 cpu_features
->level2_cache_size
= level2_cache_size
;
846 cpu_features
->level2_cache_assoc
= level2_cache_assoc
;
847 cpu_features
->level2_cache_linesize
= level2_cache_linesize
;
848 cpu_features
->level3_cache_size
= level3_cache_size
;
849 cpu_features
->level3_cache_assoc
= level3_cache_assoc
;
850 cpu_features
->level3_cache_linesize
= level3_cache_linesize
;
851 cpu_features
->level4_cache_size
= level4_cache_size
;
853 /* The default setting for the non_temporal threshold is 3/4 of one
854 thread's share of the chip's cache. For most Intel and AMD processors
855 with an initial release date between 2017 and 2020, a thread's typical
856 share of the cache is from 500 KBytes to 2 MBytes. Using the 3/4
857 threshold leaves 125 KBytes to 500 KBytes of the thread's data
858 in cache after a maximum temporal copy, which will maintain
859 in cache a reasonable portion of the thread's stack and other
860 active data. If the threshold is set higher than one thread's
861 share of the cache, it has a substantial risk of negatively
862 impacting the performance of other threads running on the chip. */
863 unsigned long int non_temporal_threshold
= shared
* 3 / 4;
864 /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of
865 'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best
866 if that operation cannot overflow. Minimum of 0x4040 (16448) because the
867 L(large_memset_4x) loops need 64-byte to cache align and enough space for
868 at least 1 iteration of 4x PAGE_SIZE unrolled loop. Both values are
869 reflected in the manual. */
870 unsigned long int maximum_non_temporal_threshold
= SIZE_MAX
>> 4;
871 unsigned long int minimum_non_temporal_threshold
= 0x4040;
872 if (non_temporal_threshold
< minimum_non_temporal_threshold
)
873 non_temporal_threshold
= minimum_non_temporal_threshold
;
874 else if (non_temporal_threshold
> maximum_non_temporal_threshold
)
875 non_temporal_threshold
= maximum_non_temporal_threshold
;
878 /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */
879 unsigned int minimum_rep_movsb_threshold
;
881 /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
882 VEC_SIZE == 64 or 32. For VEC_SIZE == 16, the default REP MOVSB
883 threshold is 2048 * (VEC_SIZE / 16). */
884 unsigned int rep_movsb_threshold
;
885 if (CPU_FEATURE_USABLE_P (cpu_features
, AVX512F
)
886 && !CPU_FEATURE_PREFERRED_P (cpu_features
, Prefer_No_AVX512
))
888 rep_movsb_threshold
= 4096 * (64 / 16);
890 minimum_rep_movsb_threshold
= 64 * 8;
893 else if (CPU_FEATURE_PREFERRED_P (cpu_features
,
894 AVX_Fast_Unaligned_Load
))
896 rep_movsb_threshold
= 4096 * (32 / 16);
898 minimum_rep_movsb_threshold
= 32 * 8;
903 rep_movsb_threshold
= 2048 * (16 / 16);
905 minimum_rep_movsb_threshold
= 16 * 8;
908 /* NB: The default REP MOVSB threshold is 2112 on processors with fast
909 short REP MOVSB (FSRM). */
910 if (CPU_FEATURE_USABLE_P (cpu_features
, FSRM
))
911 rep_movsb_threshold
= 2112;
913 /* The default threshold to use Enhanced REP STOSB. */
914 unsigned long int rep_stosb_threshold
= 2048;
917 long int tunable_size
;
919 tunable_size
= TUNABLE_GET (x86_data_cache_size
, long int, NULL
);
920 /* NB: Ignore the default value 0. */
921 if (tunable_size
!= 0)
924 tunable_size
= TUNABLE_GET (x86_shared_cache_size
, long int, NULL
);
925 /* NB: Ignore the default value 0. */
926 if (tunable_size
!= 0)
927 shared
= tunable_size
;
929 tunable_size
= TUNABLE_GET (x86_non_temporal_threshold
, long int, NULL
);
930 if (tunable_size
> minimum_non_temporal_threshold
931 && tunable_size
<= maximum_non_temporal_threshold
)
932 non_temporal_threshold
= tunable_size
;
934 tunable_size
= TUNABLE_GET (x86_rep_movsb_threshold
, long int, NULL
);
935 if (tunable_size
> minimum_rep_movsb_threshold
)
936 rep_movsb_threshold
= tunable_size
;
938 /* NB: The default value of the x86_rep_stosb_threshold tunable is the
939 same as the default value of __x86_rep_stosb_threshold and the
940 minimum value is fixed. */
941 rep_stosb_threshold
= TUNABLE_GET (x86_rep_stosb_threshold
,
944 TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size
, data
, 0, SIZE_MAX
);
945 TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size
, shared
, 0, SIZE_MAX
);
946 TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold
, non_temporal_threshold
,
947 minimum_non_temporal_threshold
,
948 maximum_non_temporal_threshold
);
949 TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold
, rep_movsb_threshold
,
950 minimum_rep_movsb_threshold
, SIZE_MAX
);
951 TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold
, rep_stosb_threshold
, 1,
955 unsigned long int rep_movsb_stop_threshold
;
956 /* ERMS feature is implemented from AMD Zen3 architecture and it is
957 performing poorly for data above L2 cache size. Henceforth, adding
958 an upper bound threshold parameter to limit the usage of Enhanced
959 REP MOVSB operations and setting its value to L2 cache size. */
960 if (cpu_features
->basic
.kind
== arch_kind_amd
)
961 rep_movsb_stop_threshold
= core
;
962 /* Setting the upper bound of ERMS to the computed value of
963 non-temporal threshold for architectures other than AMD. */
965 rep_movsb_stop_threshold
= non_temporal_threshold
;
967 cpu_features
->data_cache_size
= data
;
968 cpu_features
->shared_cache_size
= shared
;
969 cpu_features
->non_temporal_threshold
= non_temporal_threshold
;
970 cpu_features
->rep_movsb_threshold
= rep_movsb_threshold
;
971 cpu_features
->rep_stosb_threshold
= rep_stosb_threshold
;
972 cpu_features
->rep_movsb_stop_threshold
= rep_movsb_stop_threshold
;