I encountered a trouble during getting L3 cache info on Intel processors. Getting L3 line length on AMD is simple, like this:
mov eax, 0x80000006
cpuid
shl edx, 24
shr edx, 24
The same operation on Intels is much more complicated. I got that this might be done using this sequence:
mov eax, 2
cpuid
and pasring register values by this manual: http://www.microbe.cz/docs/CPUID.pdf (page 26, "Table 2-7. Descriptor Decode Values").
But my program does not found any of enumerated descriptors and returns 0 for cache size and line length.
Is there any simpler and/or sufficient method to get cache size and line length on Intels?
Here is the full code. All cpuid output (eax, ebx, ecx, edx) is pushed onto stack, then each value is compared with hardcoded descriptors list. Comparation is made on lower 8 bits, then these bits are shrinked.
__declspec(dllexport) __declspec(naked) void GetMetricLevel2(int &length) {
__asm {
// check CPUID availability
pushfd
pop eax
mov ebx, eax
xor eax, 00200000h
push eax
popfd
pushfd
pop eax
cmp eax, ebx
jnz HAS_CPUID
mov edx, -1 // return -1 by reference
jmp RET_ARG
HAS_CPUID:
mov eax, 2 // L3 Intel, incomplete
mov ecx, 0
cpuid
push ecx
or ecx, eax
or ecx, ebx
or ecx, edx
cmp ecx, 0
pop ecx // experimental
je CPU_AMD // if all registers are 0, we try AMD scheme
CPU_INTEL:
push ebp
mov ebp, 0
push 0
push eax // store counter
jmp CALL_BEGIN
CYCLE_BEGIN:
pop ecx
inc ecx
push ecx
push eax
mov eax, 2
cpuid
CALL_BEGIN:
push eax
push ebx
push ecx
push edx
mov ch, 4
PARSE_REG:
pop edx
mov cl, 4
PARSE_DESCR:
DD0H://512,4w
cmp dl, 0xD0
jne DD1H
add ebp, 512d
jmp MISS_L3CACHE
DD1H://1024,4w
cmp dl, 0xD1
jne DD2H
add ebp, 1024d
jmp MISS_L3CACHE
DD2H://2048,4w
cmp dl, 0xD2
jne DD6H
add ebp, 2048d
jmp MISS_L3CACHE
DD6H://1024,8w
cmp dl, 0xD6
jne DD7H
add ebp, 1024d
jmp MISS_L3CACHE
DD7H://2048,8w
cmp dl, 0xD7
jne DD8H
add ebp, 2048d
jmp MISS_L3CACHE
DD8H://4096,8w
cmp dl, 0xD8
jne DDCH
add ebp, 4096d
jmp MISS_L3CACHE
DDCH://1536,12w
cmp dl, 0xDC
jne DDDH
add ebp, 1536d
jmp MISS_L3CACHE
DDDH://3072,12w
cmp dl, 0xDD
jne DDEH
add ebp, 3072d
jmp MISS_L3CACHE
DDEH://6144,12w
cmp dl, 0xDE
jne DE2H
add ebp, 6144d
jmp MISS_L3CACHE
DE2H://2048,16w
cmp dl, 0xE2
jne DE3H
add ebp, 2048d
jmp MISS_L3CACHE
DE3H://4096,16w
cmp dl, 0xE3
jne DE4H
add ebp, 4096d
jmp MISS_L3CACHE
DE4H://8192,16w
cmp dl, 0xE4
jne DEAH
add ebp, 8192d
jmp MISS_L3CACHE
DEAH://12mb,24w
cmp dl, 0xEA
jne DEBH
add ebp, 12288d
jmp MISS_L3CACHE
DEBH://18mb,24w
cmp dl, 0xEB
jne DECH
add ebp, 18432d
jmp MISS_L3CACHE
DECH://24mb,24w
cmp dl, 0xEC
jne MISS_L3CACHE
add ebp, 24576d
MISS_L3CACHE:
dec cl
cmp cl, 0
shr edx, 8 // it's 8-bit descriptor
jne PARSE_DESCR
dec ch
cmp ch, 0
jne PARSE_REG
CALL_FINISH:
pop eax
cmp al, 0
je CYCLE_FINISH // replace to je then
dec al
jmp CYCLE_BEGIN
CYCLE_FINISH:
mov edx, ebp
shl edx, 8 // 8 bits for cache string length
mov dl, 64d // Intel always has 64 byte L3 string
add esp, 4
pop ebp
jmp RET_ARG
CPU_AMD:
mov eax, 0x80000006 // L3 AMD
cpuid
shl edx, 24
shr edx, 24
RET_ARG:
mov eax, [esp+4] // first argument lies here
mov [eax], edx // return by reference
ret
}
}
There are a number of problems with your code. You should use the __cpuid
compiler intrinsic and write it completely in C++. It'll make the code much easier to write and maintain.
There are two major problems with your code. The first is that you're not using CPUID function 2 correctly. The value in ECX is ignored when you use this function. The second is that you're not using CPUID function 4 to determine the cache size when function 2 returns an 0FFh
descriptor.
Other problems with your code include:
shr edx, 8
sets the flags. The loop works anyways because when EDX becomes 0 it doesn't contain any more possible L3 descriptors.Part of your problem is that you're using an outdated manual. You should use the latest Intel Software Developers Manual.
It's not very well tested, it's probably got some transcription errors in cache descriptor switch statement, but here's a C implementation that uses CPUID functions 2 and 4 to determine the size, associativity and cache line size of the L3 cache:
#include <intrin.h>
int
get_intel_l3_info(unsigned *size, unsigned *assoc, unsigned *linesize) {
int regs[4];
int i;
__cpuid(regs, 0); /* Maximum Input Value */
int max_leaf = regs[0];
if (max_leaf < 2) {
return -1; /* no way to find L3 cache info */
}
__cpuid(regs, 1); /* Additional Information */
int family = (regs[0] >> 8) & 0xF;
int model = (regs[0] >> 4) & 0xF;
__cpuid(regs, 2); /* Cache and TLB Information */
regs[0] &= 0xFFFFFF00; /* least significant byte of EAX is invalid */
for (i = 0; i < 4; i++) {
if (regs[i] < 0) { /* invalid if most significant bit set */
regs[i] = 0;
}
}
unsigned char *descriptors = (unsigned char *) regs;
const int kb = 1024;
const int mb = 1024 * kb;
#define RETINFO(s, a, l) *size = (s); *assoc = (a); *linesize = (l); return 0
int use_leaf_4 = 0;
for (i = 0; i < 32; i++) {
switch(descriptors[i]) {
case 0x22: RETINFO(512 * kb, 4, 64);
case 0x23: RETINFO(1 * mb, 8, 64);
case 0x25: RETINFO(2 * mb, 8, 64);
case 0x29: RETINFO(4 * mb, 8, 64);
case 0x40: RETINFO(0, 0, 0); /* no L3 cache */
case 0x46: RETINFO(4 * mb, 4, 64);
case 0x47: RETINFO(8 * mb, 8, 64);
case 0x49:
if (family == 0x0F && model == 0x06) {
RETINFO(4 * mb, 16, 64);
}
break;
case 0x4A: RETINFO(6 * mb, 12, 64);
case 0x4B: RETINFO(8 * mb, 16, 64);
case 0x4C: RETINFO(12 * mb, 12, 64);
case 0x4D: RETINFO(16 * mb, 16, 64);
case 0xD0: RETINFO(512 * kb, 4, 64);
case 0xD1: RETINFO(1 * mb, 4, 64);
case 0xD6: RETINFO(1 * mb, 8, 64);
case 0xD7: RETINFO(2 * mb, 8, 64);
case 0xD8: RETINFO(4 * mb, 8, 64);
case 0xDC: RETINFO(1 * mb + 512 * kb, 12, 64);
case 0xDD: RETINFO(3 * mb, 12, 64);
case 0xDE: RETINFO(6 * mb, 12, 64);
case 0xE2: RETINFO(2 * mb, 16, 64);
case 0xE3: RETINFO(4 * mb, 16, 64);
case 0xE4: RETINFO(8 * mb, 16, 64);
case 0xEA: RETINFO(12 * mb, 24, 64);
case 0xEB: RETINFO(18 * mb, 24, 64);
case 0xEC: RETINFO(24 * mb, 24, 64);
case 0xFF:
use_leaf_4 = 1;
break;
}
}
if (!use_leaf_4 || max_leaf < 4) {
return -1; /* failed, no L3 info found */
}
i = 0;
while(1) {
__cpuidex(regs, 4, i); /* Deterministic Cache Parameters */
if ((regs[0] & 0x1F) == 0) {
return RETINFO(0, 0, 0); /* no L3 cache */
}
if (((regs[0] >> 5) & 0x7) == 3) {
int lsize = (regs[1] & 0xFFF) + 1;
int partitions = ((regs[1] >> 12) & 0x3FF) + 1;
int ways = ((regs[1] >> 22) & 0x3FF) + 1;
int sets = regs[2] + 1;
RETINFO(ways * partitions * lsize * sets,
ways, lsize);
}
i++;
}
}
User contributions licensed under CC BY-SA 3.0