I encountered a trouble during getting L3 cache info on Intel processors. Getting L3 line length on AMD is simple, like this:
mov eax, 0x80000006 
cpuid
shl edx, 24
shr edx, 24
The same operation on Intels is much more complicated. I got that this might be done using this sequence:
mov eax, 2
cpuid
and pasring register values by this manual: http://www.microbe.cz/docs/CPUID.pdf (page 26, "Table 2-7. Descriptor Decode Values").
But my program does not found any of enumerated descriptors and returns 0 for cache size and line length.
Is there any simpler and/or sufficient method to get cache size and line length on Intels?
Here is the full code. All cpuid output (eax, ebx, ecx, edx) is pushed onto stack, then each value is compared with hardcoded descriptors list. Comparation is made on lower 8 bits, then these bits are shrinked.
__declspec(dllexport) __declspec(naked) void GetMetricLevel2(int &length) {
    __asm {
        // check CPUID availability
        pushfd
        pop eax
        mov ebx, eax
        xor eax, 00200000h
        push eax
        popfd
        pushfd
        pop eax
        cmp eax, ebx
        jnz HAS_CPUID
        mov edx, -1 // return -1 by reference
        jmp RET_ARG
HAS_CPUID:
        mov eax, 2 // L3 Intel, incomplete
        mov ecx, 0
        cpuid
        push ecx
        or ecx, eax
        or ecx, ebx
        or ecx, edx
        cmp ecx, 0
        pop ecx // experimental
        je CPU_AMD // if all registers are 0, we try AMD scheme
CPU_INTEL:
        push ebp
        mov ebp, 0
        push 0
        push eax // store counter
        jmp CALL_BEGIN
CYCLE_BEGIN:
        pop ecx
        inc ecx
        push ecx
        push eax
        mov eax, 2
        cpuid
CALL_BEGIN:
        push eax
        push ebx
        push ecx
        push edx
        mov ch, 4
PARSE_REG:
        pop edx
        mov cl, 4
PARSE_DESCR:
DD0H://512,4w
        cmp dl, 0xD0
        jne DD1H
        add ebp, 512d
        jmp MISS_L3CACHE
DD1H://1024,4w
        cmp dl, 0xD1
        jne DD2H
        add ebp, 1024d
        jmp MISS_L3CACHE
DD2H://2048,4w
        cmp dl, 0xD2
        jne DD6H
        add ebp, 2048d
        jmp MISS_L3CACHE
DD6H://1024,8w
        cmp dl, 0xD6
        jne DD7H
        add ebp, 1024d
        jmp MISS_L3CACHE
DD7H://2048,8w
        cmp dl, 0xD7
        jne DD8H
        add ebp, 2048d
        jmp MISS_L3CACHE
DD8H://4096,8w
        cmp dl, 0xD8
        jne DDCH
        add ebp, 4096d
        jmp MISS_L3CACHE
DDCH://1536,12w
        cmp dl, 0xDC
        jne DDDH
        add ebp, 1536d
        jmp MISS_L3CACHE
DDDH://3072,12w
        cmp dl, 0xDD
        jne DDEH
        add ebp, 3072d
        jmp MISS_L3CACHE
DDEH://6144,12w
        cmp dl, 0xDE
        jne DE2H
        add ebp, 6144d
        jmp MISS_L3CACHE
DE2H://2048,16w
        cmp dl, 0xE2
        jne DE3H
        add ebp, 2048d
        jmp MISS_L3CACHE
DE3H://4096,16w
        cmp dl, 0xE3
        jne DE4H
        add ebp, 4096d
        jmp MISS_L3CACHE
DE4H://8192,16w
        cmp dl, 0xE4
        jne DEAH
        add ebp, 8192d
        jmp MISS_L3CACHE
DEAH://12mb,24w
        cmp dl, 0xEA
        jne DEBH
        add ebp, 12288d
        jmp MISS_L3CACHE
DEBH://18mb,24w
        cmp dl, 0xEB
        jne DECH
        add ebp, 18432d
        jmp MISS_L3CACHE
DECH://24mb,24w
        cmp dl, 0xEC
        jne MISS_L3CACHE
        add ebp, 24576d
MISS_L3CACHE:
        dec cl
        cmp cl, 0
        shr edx, 8 // it's 8-bit descriptor
        jne PARSE_DESCR
        dec ch
        cmp ch, 0
        jne PARSE_REG
CALL_FINISH:
        pop eax
        cmp al, 0
        je CYCLE_FINISH // replace to je then
        dec al
        jmp CYCLE_BEGIN
CYCLE_FINISH:
        mov edx, ebp
        shl edx, 8 // 8 bits for cache string length
        mov dl, 64d // Intel always has 64 byte L3 string
        add esp, 4
        pop ebp
        jmp RET_ARG
CPU_AMD:
        mov eax, 0x80000006 // L3 AMD
        cpuid
        shl edx, 24
        shr edx, 24
RET_ARG:
        mov eax, [esp+4] // first argument lies here
        mov [eax], edx // return by reference
        ret
    }
}
There are a number of problems with your code. You should use the __cpuid compiler intrinsic and write it completely in C++. It'll make the code much easier to write and maintain. 
There are two major problems with your code. The first is that you're not using CPUID function 2 correctly. The value in ECX is ignored when you use this function. The second is that you're not using CPUID function 4 to determine the cache size when function 2 returns an 0FFh descriptor.
Other problems with your code include:
shr edx, 8 sets the flags. The loop works anyways because when EDX becomes 0 it doesn't contain any more possible L3 descriptors.Part of your problem is that you're using an outdated manual. You should use the latest Intel Software Developers Manual.
It's not very well tested, it's probably got some transcription errors in cache descriptor switch statement, but here's a C implementation that uses CPUID functions 2 and 4 to determine the size, associativity and cache line size of the L3 cache:
#include <intrin.h>
int
get_intel_l3_info(unsigned *size, unsigned *assoc, unsigned *linesize) {
    int regs[4];
    int i;
    __cpuid(regs, 0); /* Maximum Input Value */
    int max_leaf = regs[0];
    if (max_leaf < 2) {
        return -1; /* no way to find L3 cache info */
    }
    __cpuid(regs, 1); /* Additional Information */
    int family = (regs[0] >> 8) & 0xF;
    int model = (regs[0] >> 4) & 0xF;
    __cpuid(regs, 2); /* Cache and TLB Information */
    regs[0] &= 0xFFFFFF00; /* least significant byte of EAX is invalid */
    for (i = 0; i < 4; i++) {
        if (regs[i] < 0) { /* invalid if most significant bit set */
            regs[i] = 0;
        }
    }
    unsigned char *descriptors = (unsigned char *) regs;
    const int kb = 1024;
    const int mb = 1024 * kb;
#define RETINFO(s, a, l) *size = (s); *assoc = (a); *linesize = (l); return 0
    int use_leaf_4 = 0;
    for (i = 0; i < 32; i++) {
        switch(descriptors[i]) {
        case 0x22: RETINFO(512 * kb, 4, 64);
        case 0x23: RETINFO(1 * mb, 8, 64);
        case 0x25: RETINFO(2 * mb, 8, 64);
        case 0x29: RETINFO(4 * mb, 8, 64);
        case 0x40: RETINFO(0, 0, 0); /* no L3 cache */
        case 0x46: RETINFO(4 * mb, 4, 64);
        case 0x47: RETINFO(8 * mb, 8, 64);
        case 0x49:
            if (family == 0x0F && model == 0x06) {
                RETINFO(4 * mb, 16, 64);
            }
            break;
        case 0x4A: RETINFO(6 * mb, 12, 64);
        case 0x4B: RETINFO(8 * mb, 16, 64);
        case 0x4C: RETINFO(12 * mb, 12, 64);
        case 0x4D: RETINFO(16  * mb, 16, 64);
        case 0xD0: RETINFO(512 * kb, 4, 64);
        case 0xD1: RETINFO(1 * mb, 4, 64);
        case 0xD6: RETINFO(1 * mb, 8, 64);
        case 0xD7: RETINFO(2 * mb, 8, 64);
        case 0xD8: RETINFO(4 * mb, 8, 64);
        case 0xDC: RETINFO(1 * mb + 512 * kb, 12, 64);
        case 0xDD: RETINFO(3 * mb, 12, 64);
        case 0xDE: RETINFO(6 * mb, 12, 64);
        case 0xE2: RETINFO(2 * mb, 16, 64);
        case 0xE3: RETINFO(4 * mb, 16, 64);
        case 0xE4: RETINFO(8 * mb, 16, 64);
        case 0xEA: RETINFO(12 * mb, 24, 64);
        case 0xEB: RETINFO(18 * mb, 24, 64);
        case 0xEC: RETINFO(24 * mb, 24, 64);
        case 0xFF:
            use_leaf_4 = 1;
            break;
        }
    }
    if (!use_leaf_4 || max_leaf < 4) {
        return -1; /* failed, no L3 info found */
    }
    i = 0;
    while(1) {
        __cpuidex(regs, 4, i); /* Deterministic Cache Parameters */
        if ((regs[0] & 0x1F) == 0) {
            return RETINFO(0, 0, 0); /* no L3 cache */
        }
        if (((regs[0] >> 5) & 0x7) == 3) {
            int lsize = (regs[1] & 0xFFF) + 1;
            int partitions = ((regs[1] >> 12) & 0x3FF) + 1;
            int ways = ((regs[1] >> 22) & 0x3FF) + 1;
            int sets = regs[2] + 1;
            RETINFO(ways * partitions * lsize * sets,
                ways, lsize);
        }
        i++;
    }
}
User contributions licensed under CC BY-SA 3.0