-
Notifications
You must be signed in to change notification settings - Fork 3
Description
The current assumption stated in the comments, "The L3 cache (Last-Level Cache or LLC in many contexts) is typically shared among all cores within the same physical CPU socket", is false in practice. It is only really true on small consumer CPUs.
Here is what things look like on AMD Threadripper 7970X:
Machine (125GB total) cpuset=0xffffffff,0xffffffff
Package L#0 cpuset=0xffffffff,0xffffffff
Die L#0 cpuset=0x000000ff,0x000000ff
NUMANode(HBM) L#0 (P#0 31GB) cpuset=0x000000ff,0x000000ff
L3 L#0 (32MB) cpuset=0x000000ff,0x000000ff
L2 L#0 (1024KB) cpuset=0x00000001,0x00000001
L1d L#0 (32KB) cpuset=0x00000001,0x00000001
L1i L#0 (32KB) cpuset=0x00000001,0x00000001
Core L#0 cpuset=0x00000001,0x00000001
PU L#0 (P#0) cpuset=0x00000001
PU L#1 (P#32) cpuset=0x00000001,0x0
L2 L#1 (1024KB) cpuset=0x00000002,0x00000002
L1d L#1 (32KB) cpuset=0x00000002,0x00000002
L1i L#1 (32KB) cpuset=0x00000002,0x00000002
Core L#1 cpuset=0x00000002,0x00000002
PU L#2 (P#1) cpuset=0x00000002
PU L#3 (P#33) cpuset=0x00000002,0x0
L2 L#2 (1024KB) cpuset=0x00000004,0x00000004
L1d L#2 (32KB) cpuset=0x00000004,0x00000004
L1i L#2 (32KB) cpuset=0x00000004,0x00000004
Core L#2 cpuset=0x00000004,0x00000004
PU L#4 (P#2) cpuset=0x00000004
PU L#5 (P#34) cpuset=0x00000004,0x0
L2 L#3 (1024KB) cpuset=0x00000008,0x00000008
L1d L#3 (32KB) cpuset=0x00000008,0x00000008
L1i L#3 (32KB) cpuset=0x00000008,0x00000008
Core L#3 cpuset=0x00000008,0x00000008
PU L#6 (P#3) cpuset=0x00000008
PU L#7 (P#35) cpuset=0x00000008,0x0
L2 L#4 (1024KB) cpuset=0x00000010,0x00000010
L1d L#4 (32KB) cpuset=0x00000010,0x00000010
L1i L#4 (32KB) cpuset=0x00000010,0x00000010
Core L#4 cpuset=0x00000010,0x00000010
PU L#8 (P#4) cpuset=0x00000010
PU L#9 (P#36) cpuset=0x00000010,0x0
L2 L#5 (1024KB) cpuset=0x00000020,0x00000020
L1d L#5 (32KB) cpuset=0x00000020,0x00000020
L1i L#5 (32KB) cpuset=0x00000020,0x00000020
Core L#5 cpuset=0x00000020,0x00000020
PU L#10 (P#5) cpuset=0x00000020
PU L#11 (P#37) cpuset=0x00000020,0x0
L2 L#6 (1024KB) cpuset=0x00000040,0x00000040
L1d L#6 (32KB) cpuset=0x00000040,0x00000040
L1i L#6 (32KB) cpuset=0x00000040,0x00000040
Core L#6 cpuset=0x00000040,0x00000040
PU L#12 (P#6) cpuset=0x00000040
PU L#13 (P#38) cpuset=0x00000040,0x0
L2 L#7 (1024KB) cpuset=0x00000080,0x00000080
L1d L#7 (32KB) cpuset=0x00000080,0x00000080
L1i L#7 (32KB) cpuset=0x00000080,0x00000080
Core L#7 cpuset=0x00000080,0x00000080
PU L#14 (P#7) cpuset=0x00000080
PU L#15 (P#39) cpuset=0x00000080,0x0
Die L#1 cpuset=0x0000ff00,0x0000ff00
NUMANode(DRAM) L#1 (P#3 31GB) cpuset=0x0000ff00,0x0000ff00
L3 L#1 (32MB) cpuset=0x0000ff00,0x0000ff00
L2 L#8 (1024KB) cpuset=0x00000100,0x00000100
L1d L#8 (32KB) cpuset=0x00000100,0x00000100
L1i L#8 (32KB) cpuset=0x00000100,0x00000100
Core L#8 cpuset=0x00000100,0x00000100
PU L#16 (P#8) cpuset=0x00000100
PU L#17 (P#40) cpuset=0x00000100,0x0
L2 L#9 (1024KB) cpuset=0x00000200,0x00000200
L1d L#9 (32KB) cpuset=0x00000200,0x00000200
L1i L#9 (32KB) cpuset=0x00000200,0x00000200
Core L#9 cpuset=0x00000200,0x00000200
PU L#18 (P#9) cpuset=0x00000200
PU L#19 (P#41) cpuset=0x00000200,0x0
L2 L#10 (1024KB) cpuset=0x00000400,0x00000400
L1d L#10 (32KB) cpuset=0x00000400,0x00000400
L1i L#10 (32KB) cpuset=0x00000400,0x00000400
Core L#10 cpuset=0x00000400,0x00000400
PU L#20 (P#10) cpuset=0x00000400
PU L#21 (P#42) cpuset=0x00000400,0x0
L2 L#11 (1024KB) cpuset=0x00000800,0x00000800
L1d L#11 (32KB) cpuset=0x00000800,0x00000800
L1i L#11 (32KB) cpuset=0x00000800,0x00000800
Core L#11 cpuset=0x00000800,0x00000800
PU L#22 (P#11) cpuset=0x00000800
PU L#23 (P#43) cpuset=0x00000800,0x0
L2 L#12 (1024KB) cpuset=0x00001000,0x00001000
L1d L#12 (32KB) cpuset=0x00001000,0x00001000
L1i L#12 (32KB) cpuset=0x00001000,0x00001000
Core L#12 cpuset=0x00001000,0x00001000
PU L#24 (P#12) cpuset=0x00001000
PU L#25 (P#44) cpuset=0x00001000,0x0
L2 L#13 (1024KB) cpuset=0x00002000,0x00002000
L1d L#13 (32KB) cpuset=0x00002000,0x00002000
L1i L#13 (32KB) cpuset=0x00002000,0x00002000
Core L#13 cpuset=0x00002000,0x00002000
PU L#26 (P#13) cpuset=0x00002000
PU L#27 (P#45) cpuset=0x00002000,0x0
L2 L#14 (1024KB) cpuset=0x00004000,0x00004000
L1d L#14 (32KB) cpuset=0x00004000,0x00004000
L1i L#14 (32KB) cpuset=0x00004000,0x00004000
Core L#14 cpuset=0x00004000,0x00004000
PU L#28 (P#14) cpuset=0x00004000
PU L#29 (P#46) cpuset=0x00004000,0x0
L2 L#15 (1024KB) cpuset=0x00008000,0x00008000
L1d L#15 (32KB) cpuset=0x00008000,0x00008000
L1i L#15 (32KB) cpuset=0x00008000,0x00008000
Core L#15 cpuset=0x00008000,0x00008000
PU L#30 (P#15) cpuset=0x00008000
PU L#31 (P#47) cpuset=0x00008000,0x0
Die L#2 cpuset=0x00ff0000,0x00ff0000
NUMANode(DRAM) L#2 (P#1 31GB) cpuset=0x00ff0000,0x00ff0000
L3 L#2 (32MB) cpuset=0x00ff0000,0x00ff0000
L2 L#16 (1024KB) cpuset=0x00010000,0x00010000
L1d L#16 (32KB) cpuset=0x00010000,0x00010000
L1i L#16 (32KB) cpuset=0x00010000,0x00010000
Core L#16 cpuset=0x00010000,0x00010000
PU L#32 (P#16) cpuset=0x00010000
PU L#33 (P#48) cpuset=0x00010000,0x0
L2 L#17 (1024KB) cpuset=0x00020000,0x00020000
L1d L#17 (32KB) cpuset=0x00020000,0x00020000
L1i L#17 (32KB) cpuset=0x00020000,0x00020000
Core L#17 cpuset=0x00020000,0x00020000
PU L#34 (P#17) cpuset=0x00020000
PU L#35 (P#49) cpuset=0x00020000,0x0
L2 L#18 (1024KB) cpuset=0x00040000,0x00040000
L1d L#18 (32KB) cpuset=0x00040000,0x00040000
L1i L#18 (32KB) cpuset=0x00040000,0x00040000
Core L#18 cpuset=0x00040000,0x00040000
PU L#36 (P#18) cpuset=0x00040000
PU L#37 (P#50) cpuset=0x00040000,0x0
L2 L#19 (1024KB) cpuset=0x00080000,0x00080000
L1d L#19 (32KB) cpuset=0x00080000,0x00080000
L1i L#19 (32KB) cpuset=0x00080000,0x00080000
Core L#19 cpuset=0x00080000,0x00080000
PU L#38 (P#19) cpuset=0x00080000
PU L#39 (P#51) cpuset=0x00080000,0x0
L2 L#20 (1024KB) cpuset=0x00100000,0x00100000
L1d L#20 (32KB) cpuset=0x00100000,0x00100000
L1i L#20 (32KB) cpuset=0x00100000,0x00100000
Core L#20 cpuset=0x00100000,0x00100000
PU L#40 (P#20) cpuset=0x00100000
PU L#41 (P#52) cpuset=0x00100000,0x0
L2 L#21 (1024KB) cpuset=0x00200000,0x00200000
L1d L#21 (32KB) cpuset=0x00200000,0x00200000
L1i L#21 (32KB) cpuset=0x00200000,0x00200000
Core L#21 cpuset=0x00200000,0x00200000
PU L#42 (P#21) cpuset=0x00200000
PU L#43 (P#53) cpuset=0x00200000,0x0
L2 L#22 (1024KB) cpuset=0x00400000,0x00400000
L1d L#22 (32KB) cpuset=0x00400000,0x00400000
L1i L#22 (32KB) cpuset=0x00400000,0x00400000
Core L#22 cpuset=0x00400000,0x00400000
PU L#44 (P#22) cpuset=0x00400000
PU L#45 (P#54) cpuset=0x00400000,0x0
L2 L#23 (1024KB) cpuset=0x00800000,0x00800000
L1d L#23 (32KB) cpuset=0x00800000,0x00800000
L1i L#23 (32KB) cpuset=0x00800000,0x00800000
Core L#23 cpuset=0x00800000,0x00800000
PU L#46 (P#23) cpuset=0x00800000
PU L#47 (P#55) cpuset=0x00800000,0x0
Die L#3 cpuset=0xff000000,0xff000000
NUMANode(DRAM) L#3 (P#2 31GB) cpuset=0xff000000,0xff000000
L3 L#3 (32MB) cpuset=0xff000000,0xff000000
L2 L#24 (1024KB) cpuset=0x01000000,0x01000000
L1d L#24 (32KB) cpuset=0x01000000,0x01000000
L1i L#24 (32KB) cpuset=0x01000000,0x01000000
Core L#24 cpuset=0x01000000,0x01000000
PU L#48 (P#24) cpuset=0x01000000
PU L#49 (P#56) cpuset=0x01000000,0x0
L2 L#25 (1024KB) cpuset=0x02000000,0x02000000
L1d L#25 (32KB) cpuset=0x02000000,0x02000000
L1i L#25 (32KB) cpuset=0x02000000,0x02000000
Core L#25 cpuset=0x02000000,0x02000000
PU L#50 (P#25) cpuset=0x02000000
PU L#51 (P#57) cpuset=0x02000000,0x0
L2 L#26 (1024KB) cpuset=0x04000000,0x04000000
L1d L#26 (32KB) cpuset=0x04000000,0x04000000
L1i L#26 (32KB) cpuset=0x04000000,0x04000000
Core L#26 cpuset=0x04000000,0x04000000
PU L#52 (P#26) cpuset=0x04000000
PU L#53 (P#58) cpuset=0x04000000,0x0
L2 L#27 (1024KB) cpuset=0x08000000,0x08000000
L1d L#27 (32KB) cpuset=0x08000000,0x08000000
L1i L#27 (32KB) cpuset=0x08000000,0x08000000
Core L#27 cpuset=0x08000000,0x08000000
PU L#54 (P#27) cpuset=0x08000000
PU L#55 (P#59) cpuset=0x08000000,0x0
L2 L#28 (1024KB) cpuset=0x10000000,0x10000000
L1d L#28 (32KB) cpuset=0x10000000,0x10000000
L1i L#28 (32KB) cpuset=0x10000000,0x10000000
Core L#28 cpuset=0x10000000,0x10000000
PU L#56 (P#28) cpuset=0x10000000
PU L#57 (P#60) cpuset=0x10000000,0x0
L2 L#29 (1024KB) cpuset=0x20000000,0x20000000
L1d L#29 (32KB) cpuset=0x20000000,0x20000000
L1i L#29 (32KB) cpuset=0x20000000,0x20000000
Core L#29 cpuset=0x20000000,0x20000000
PU L#58 (P#29) cpuset=0x20000000
PU L#59 (P#61) cpuset=0x20000000,0x0
L2 L#30 (1024KB) cpuset=0x40000000,0x40000000
L1d L#30 (32KB) cpuset=0x40000000,0x40000000
L1i L#30 (32KB) cpuset=0x40000000,0x40000000
Core L#30 cpuset=0x40000000,0x40000000
PU L#60 (P#30) cpuset=0x40000000
PU L#61 (P#62) cpuset=0x40000000,0x0
L2 L#31 (1024KB) cpuset=0x80000000,0x80000000
L1d L#31 (32KB) cpuset=0x80000000,0x80000000
L1i L#31 (32KB) cpuset=0x80000000,0x80000000
Core L#31 cpuset=0x80000000,0x80000000
PU L#62 (P#31) cpuset=0x80000000
PU L#63 (P#63) cpuset=0x80000000,0x0
And here is AMD Epyc 7302P:
Machine (126GB total) cpuset=0xffffffff
Package L#0 cpuset=0xffffffff
Group0 L#0 cpuset=0x000f000f
NUMANode L#0 (P#0 31GB) cpuset=0x000f000f
L3 L#0 (16MB) cpuset=0x00030003
L2 L#0 (512KB) cpuset=0x00010001
L1d L#0 (32KB) cpuset=0x00010001
L1i L#0 (32KB) cpuset=0x00010001
Core L#0 cpuset=0x00010001
PU L#0 (P#0) cpuset=0x00000001
PU L#1 (P#16) cpuset=0x00010000
L2 L#1 (512KB) cpuset=0x00020002
L1d L#1 (32KB) cpuset=0x00020002
L1i L#1 (32KB) cpuset=0x00020002
Core L#1 cpuset=0x00020002
PU L#2 (P#1) cpuset=0x00000002
PU L#3 (P#17) cpuset=0x00020000
L3 L#1 (16MB) cpuset=0x000c000c
L2 L#2 (512KB) cpuset=0x00040004
L1d L#2 (32KB) cpuset=0x00040004
L1i L#2 (32KB) cpuset=0x00040004
Core L#2 cpuset=0x00040004
PU L#4 (P#2) cpuset=0x00000004
PU L#5 (P#18) cpuset=0x00040000
L2 L#3 (512KB) cpuset=0x00080008
L1d L#3 (32KB) cpuset=0x00080008
L1i L#3 (32KB) cpuset=0x00080008
Core L#3 cpuset=0x00080008
PU L#6 (P#3) cpuset=0x00000008
PU L#7 (P#19) cpuset=0x00080000
Group0 L#1 cpuset=0x00f000f0
NUMANode L#1 (P#1 31GB) cpuset=0x00f000f0
L3 L#2 (16MB) cpuset=0x00300030
L2 L#4 (512KB) cpuset=0x00100010
L1d L#4 (32KB) cpuset=0x00100010
L1i L#4 (32KB) cpuset=0x00100010
Core L#4 cpuset=0x00100010
PU L#8 (P#4) cpuset=0x00000010
PU L#9 (P#20) cpuset=0x00100000
L2 L#5 (512KB) cpuset=0x00200020
L1d L#5 (32KB) cpuset=0x00200020
L1i L#5 (32KB) cpuset=0x00200020
Core L#5 cpuset=0x00200020
PU L#10 (P#5) cpuset=0x00000020
PU L#11 (P#21) cpuset=0x00200000
L3 L#3 (16MB) cpuset=0x00c000c0
L2 L#6 (512KB) cpuset=0x00400040
L1d L#6 (32KB) cpuset=0x00400040
L1i L#6 (32KB) cpuset=0x00400040
Core L#6 cpuset=0x00400040
PU L#12 (P#6) cpuset=0x00000040
PU L#13 (P#22) cpuset=0x00400000
L2 L#7 (512KB) cpuset=0x00800080
L1d L#7 (32KB) cpuset=0x00800080
L1i L#7 (32KB) cpuset=0x00800080
Core L#7 cpuset=0x00800080
PU L#14 (P#7) cpuset=0x00000080
PU L#15 (P#23) cpuset=0x00800000
Group0 L#2 cpuset=0x0f000f00
NUMANode L#2 (P#2 31GB) cpuset=0x0f000f00
L3 L#4 (16MB) cpuset=0x03000300
L2 L#8 (512KB) cpuset=0x01000100
L1d L#8 (32KB) cpuset=0x01000100
L1i L#8 (32KB) cpuset=0x01000100
Core L#8 cpuset=0x01000100
PU L#16 (P#8) cpuset=0x00000100
PU L#17 (P#24) cpuset=0x01000000
L2 L#9 (512KB) cpuset=0x02000200
L1d L#9 (32KB) cpuset=0x02000200
L1i L#9 (32KB) cpuset=0x02000200
Core L#9 cpuset=0x02000200
PU L#18 (P#9) cpuset=0x00000200
PU L#19 (P#25) cpuset=0x02000000
L3 L#5 (16MB) cpuset=0x0c000c00
L2 L#10 (512KB) cpuset=0x04000400
L1d L#10 (32KB) cpuset=0x04000400
L1i L#10 (32KB) cpuset=0x04000400
Core L#10 cpuset=0x04000400
PU L#20 (P#10) cpuset=0x00000400
PU L#21 (P#26) cpuset=0x04000000
L2 L#11 (512KB) cpuset=0x08000800
L1d L#11 (32KB) cpuset=0x08000800
L1i L#11 (32KB) cpuset=0x08000800
Core L#11 cpuset=0x08000800
PU L#22 (P#11) cpuset=0x00000800
PU L#23 (P#27) cpuset=0x08000000
Group0 L#3 cpuset=0xf000f000
NUMANode L#3 (P#3 31GB) cpuset=0xf000f000
L3 L#6 (16MB) cpuset=0x30003000
L2 L#12 (512KB) cpuset=0x10001000
L1d L#12 (32KB) cpuset=0x10001000
L1i L#12 (32KB) cpuset=0x10001000
Core L#12 cpuset=0x10001000
PU L#24 (P#12) cpuset=0x00001000
PU L#25 (P#28) cpuset=0x10000000
L2 L#13 (512KB) cpuset=0x20002000
L1d L#13 (32KB) cpuset=0x20002000
L1i L#13 (32KB) cpuset=0x20002000
Core L#13 cpuset=0x20002000
PU L#26 (P#13) cpuset=0x00002000
PU L#27 (P#29) cpuset=0x20000000
L3 L#7 (16MB) cpuset=0xc000c000
L2 L#14 (512KB) cpuset=0x40004000
L1d L#14 (32KB) cpuset=0x40004000
L1i L#14 (32KB) cpuset=0x40004000
Core L#14 cpuset=0x40004000
PU L#28 (P#14) cpuset=0x00004000
PU L#29 (P#30) cpuset=0x40000000
L2 L#15 (512KB) cpuset=0x80008000
L1d L#15 (32KB) cpuset=0x80008000
L1i L#15 (32KB) cpuset=0x80008000
Core L#15 cpuset=0x80008000
PU L#30 (P#15) cpuset=0x00008000
PU L#31 (P#31) cpuset=0x80000000
Both systems have a single socket and are configured to NPS4 (the default BIOS configuration presents each system as a single NUMA node instead, but from the L3 caches you can clearly discern that they belong to different physical dies). This is not specific to workstation/server CPUs either: many consumer CPUs these days have a chiplet design and likewise expose multiple L3 caches.
In my experience, the best way to manage cores properly, regardless of BIOS configuration, is to look at how cores are grouped by L3 cache, since latency between cores that do not share an L3 cache is significantly higher.
There is no such API in the library right now. With https://github.com/HadrienG2/hwlocality it looks something like this:
let topology = hwlocality::Topology::new().unwrap();
let cpu_cores = topology
// Iterate over groups of L3 caches
.objects_with_type(hwlocality::object::types::ObjectType::L3Cache)
// For each L3 cache get CPU set
.filter_map(|node| node.cpuset())
// For each CPU set extract individual cores
.map(|cpuset| cpuset.iter_set().map(usize::from).collect::<Vec<_>>())
.filter(|cores| !cores.is_empty())
.collect::<Vec<_>>();
BTW, I'd appreciate a similar API where no implicit static variable is used.