Enabling EPT in VMX causes failed entry due to guest state

0

I'm building a hypervisor at home and I'm having an issue with VM entry when EPT is enabled. The code below sets up the VMCS (controls, host state and guest state), and without EPT the VM entry succeeds. However, as soon as I enable EPT, the entry fails with basic exit reason 33 (VM-entry failure due to invalid guest state). This happens when I uncomment the following code:

vmcs_write(SECONDARY_VM_EXEC_CONTROL, adjust_msr_control(
        MSR_IA32_VMX_PROCBASED_CTLS2, CPU_BASED_CTL2_RDTSCP | CPU_BASED_CTL2_ENABLE_INVPCID /* | CPU_BASED_CTL2_ENABLE_VPID | CPU_BASED_CTL2_ENABLE_XSAVE_XRSTORS */ | CPU_BASED_CTL2_ENABLE_EPT
    )); <--- I add CPU_BASED_CTL2_ENABLE_EPT
vmcs_write64(EPT_POINTER, vms->eptp.value);

I have checked the Intel manual numerous times to make sure I'm satisfying the guest-state checks, and I don't see why the entry fails only when I enable EPT. Any suggestions would be appreciated, thanks :)

static noinline void vmwrite_error(unsigned long field, unsigned long value){
         printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
                field, value, (int)(vmcs_read(VM_INSTRUCTION_ERROR)));
         dump_stack();
         BUG_ON(1);
}

/*
 * Write `value` into the VMCS field with encoding `field`.
 *
 * On failure vmwrite_error() is called with the same arguments; on success
 * the write is logged at KERN_INFO level.
 */
static void vmcs_write(unsigned long field, unsigned long value){
        uint8_t err;
        /*
         * VMWRITE signals failure through CF or ZF. SETNA stores 1 when
         * either flag is set, so err is non-zero exactly when the write failed.
         */
        __asm__ __volatile__(
                "vmwrite %[value],%[field]; setna %[err]"
                : [err]"=rm"(err)
                : [field]"r"(field), [value]"r"(value)
               : "cc", "memory"
        );
        if(err)
                vmwrite_error(field, value);
        else
                printk(KERN_INFO "vmwrite log: reg %lx value %lx\n", field, value);
}

EPTP alloc_ept(int initial_pages_count){
    int i;
    EPTP eptp;
    EPT_PML4E *ept_pml4;
    EPT_PDPTE *ept_pdpt;
    EPT_PDE *ept_pd;
    EPT_PTE *ept_pt;
    eptp.value = 0;

    ept_pml4 = kzalloc(4096, GFP_KERNEL | GFP_NOWAIT);
    if(!ept_pml4)
        goto pml4err;
    ept_pdpt = kzalloc(4096, GFP_KERNEL | GFP_NOWAIT);
    if(!ept_pdpt)
        goto pdpterr;
    ept_pd = kzalloc(4096, GFP_KERNEL | GFP_NOWAIT);
    if(!ept_pd)
        goto pderr;
    ept_pt = kzalloc(4096, GFP_KERNEL | GFP_NOWAIT);
    if(!ept_pt)
        goto pterr; 

    for(i = 0; i < initial_pages_count; i++){
        ept_pt[i].fields.read_access = 1;
        ept_pt[i].fields.write_access = 1;
        ept_pt[i].fields.execute_access = 1;
        ept_pt[i].fields.ept_memtype = 6;
        ept_pt[i].fields.phys_addr = virt_to_phys(kzalloc(4096, GFP_KERNEL | GFP_NOWAIT));
    }

    ept_pd->fields.read_access = 1;
    ept_pd->fields.write_access = 1;
    ept_pd->fields.execute_access = 1;
    ept_pd->fields.phys_addr = virt_to_phys(ept_pt);    

    ept_pdpt->fields.read_access = 1;
    ept_pdpt->fields.write_access = 1;
    ept_pdpt->fields.execute_access = 1;
    ept_pdpt->fields.phys_addr = virt_to_phys(ept_pd);

    ept_pml4->fields.read_access = 1;
    ept_pml4->fields.write_access = 1;
    ept_pml4->fields.execute_access = 1;
    ept_pml4->fields.phys_addr = virt_to_phys(ept_pdpt);

    eptp.fields.memtype = 6;
    eptp.fields.page_walk = 3;
    eptp.fields.accessed_and_dirty_flags_enabled = 1;
    eptp.fields.pml4_phys_addr = virt_to_phys(ept_pml4);

    return eptp;

    pterr:
    kfree(ept_pd);
    pderr:
    kfree(ept_pdpt);
    pdpterr:
    kfree(ept_pml4);
    pml4err:
    panic("EPT ALLOC ERROR!");
}

static void setup_vm_code(vmstate *vms){
    int i;
        EPT_PML4E *pml = phys_to_virt(vms->eptp.fields.pml4_phys_addr);
        EPT_PDPTE *pdpt = phys_to_virt(pml->fields.phys_addr);
        EPT_PDE *pd = phys_to_virt(pdpt->fields.phys_addr);
        EPT_PTE *pt = phys_to_virt(pd->fields.phys_addr);

    vms->initial_rip = (unsigned long)phys_to_virt(pt[0].fields.phys_addr);
    for(i = 0; i < 4096; i++){  
        // hlt
        *(char*)(vms->initial_rip+i) = 0xf4;
    }
    printk(KERN_INFO "INITIAL_RIP: %lu", vms->initial_rip);
    // Stack grows down
    vms->initial_rsp = (unsigned long)phys_to_virt(pt[9].fields.phys_addr) + 4095;
}

/*
 * Per-CPU preparation, run on every CPU through on_each_cpu().
 *
 * info: unused callback argument.
 *
 * Copies the low 32 bits of IA32_VMX_BASIC (the VMCS revision id) to the
 * start of this CPU's VMXON and VMCS regions, builds a 10-page EPT, places
 * the guest code/stack in it and finally calls vmx_enable().
 */
static void prepare_vmx_cpu(void *info){
    uint32_t basic_lo = 0;
    uint32_t basic_hi = 0;
    vmstate *vms = per_cpu(cpu_vms, smp_processor_id());

    // Both regions must begin with the VMCS revision id
    rdmsr_safe(MSR_IA32_VMX_BASIC, &basic_lo, &basic_hi);
    memcpy(vms->vmxon_region, &basic_lo, sizeof(basic_lo));
    memcpy(vms->vmcs_region, &basic_lo, sizeof(basic_lo));

    // Guest memory: EPT hierarchy first, then the code/stack pages inside it
    vms->eptp = alloc_ept(10);
    setup_vm_code(vms);

    vmx_enable();
}

//static void handle_vmexit(void) __attribute__((used));
/*
 * VM-exit entry point (installed as HOST_RIP).
 *
 * Reads the exit reason and exit qualification from the VMCS, dumps the VMCS
 * state and then panics with a summary of the exit.
 *
 * The dump is issued before panic(): panic() does not return, so anything
 * placed after it never runs.
 */
static void handle_vmexit(void){
    int exit_reason = vmcs_read32(VM_EXIT_REASON);
    int basic_exit_code = exit_reason & 0xffff;        /* bits 15:0: basic exit reason */
    int exit_qualification = vmcs_read32(EXIT_QUALIFICATION);
    int vm_entry_failure = exit_reason & 0x80000000;   /* bit 31: VM-entry failure */

    vmx_dump_cpu();
    panic("VMEXIT WITH CODE %d, VM ENTRY FAILURE: %s, QUAL: %d", basic_exit_code, vm_entry_failure ? "true" : "false", exit_qualification);
    //TODO: switch error reasons and VMRESUME() instead of panicking once exits are handled
}

static void vmx_setup_vm_controls(void){
    // VM Execution Controls
    vmcs_write(PIN_BASED_VM_EXEC_CONTROL, adjust_msr_control(MSR_IA32_VMX_PINBASED_CTLS, 0));
    vmcs_write(CPU_BASED_VM_EXEC_CONTROL, adjust_msr_control(
        MSR_IA32_VMX_PROCBASED_CTLS, CPU_BASED_HLT_EXITING | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS));
    vmcs_write(SECONDARY_VM_EXEC_CONTROL, adjust_msr_control(
        MSR_IA32_VMX_PROCBASED_CTLS2, CPU_BASED_CTL2_RDTSCP | CPU_BASED_CTL2_ENABLE_INVPCID /* | CPU_BASED_CTL2_ENABLE_VPID | CPU_BASED_CTL2_ENABLE_XSAVE_XRSTORS */ | CPU_BASED_CTL2_ENABLE_EPT
    ));

    //vmcs_write64(TSC_OFFSET, 0);  

    vmcs_write(CR0_READ_SHADOW, read_cr0());
    vmcs_write(CR4_READ_SHADOW, __read_cr4());
    vmcs_write(CR0_GUEST_HOST_MASK, ~0ul);
    vmcs_write(CR4_GUEST_HOST_MASK, ~0ul);

    // How many CR3_TARGET_VALUEs are considered without VM exit when MOV CR3, VAL
    vmcs_write(CR3_TARGET_COUNT, 0);

    // VM Entry & Exit Controls
    vmcs_write(VM_EXIT_CONTROLS, adjust_msr_control(MSR_IA32_VMX_EXIT_CTLS, VM_EXIT_IA32E_MODE | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_HOST_ADDR_SPACE_SIZE));
    vmcs_write(VM_ENTRY_CONTROLS, adjust_msr_control(MSR_IA32_VMX_ENTRY_CTLS, VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER));
}

/*
 * Fill the host-state area of the current VMCS from the running kernel.
 *
 * vms: per-CPU state; supplies the stack used by the VM-exit handler.
 *
 * Control registers, segment bases, descriptor-table bases, SYSENTER MSRs and
 * EFER are read from the current CPU. HOST_RIP is handle_vmexit and HOST_RSP
 * is the last byte of vms->vmm_handle_stack.
 */
static void vmx_setup_initial_host_state(vmstate *vms){
    struct desc_ptr gdt_desc, idt_desc;
    const unsigned long kernel_data_fields[] = {
        HOST_DS_SELECTOR, HOST_ES_SELECTOR, HOST_SS_SELECTOR
    };
    unsigned long exit_stack;
    unsigned int k;

    // Control registers
    vmcs_write(HOST_CR0, read_cr0());
    vmcs_write(HOST_CR3, __read_cr3());
    vmcs_write(HOST_CR4, __read_cr4());

    // Where the CPU lands on VM exit
    exit_stack = (unsigned long)vms->vmm_handle_stack + vms->vmm_handle_stack_size - 1;
    vmcs_write(HOST_RSP, exit_stack);
    vmcs_write(HOST_RIP, (unsigned long)handle_vmexit);

    /* An explanation of segment selectors: https://medium.com/hungys-blog/linux-kernel-memory-addressing-a0d304283af3 */
    // Segment Selectors
    vmcs_write(HOST_CS_SELECTOR, __KERNEL_CS);
    for(k = 0; k < sizeof(kernel_data_fields) / sizeof(kernel_data_fields[0]); k++)
        vmcs_write(kernel_data_fields[k], __KERNEL_DS);
    vmcs_write(HOST_FS_SELECTOR, 0);
    vmcs_write(HOST_GS_SELECTOR, 0);
    vmcs_write(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);

    // Segment Base Addresses
    vmcs_write(HOST_FS_BASE, native_read_msr(MSR_FS_BASE));
    vmcs_write(HOST_GS_BASE, native_read_msr(MSR_GS_BASE));
    vmcs_write(HOST_TR_BASE, read_tr_base());

    // Descriptor tables
    native_store_gdt(&gdt_desc);
    vmcs_write(HOST_GDTR_BASE, gdt_desc.address);
    store_idt(&idt_desc);
    vmcs_write(HOST_IDTR_BASE, idt_desc.address);

    // MSRs
    vmcs_write(HOST_IA32_SYSENTER_CS, native_read_msr(MSR_IA32_SYSENTER_CS));
    vmcs_write(HOST_IA32_SYSENTER_ESP, native_read_msr(MSR_IA32_SYSENTER_ESP));
    vmcs_write(HOST_IA32_SYSENTER_EIP, native_read_msr(MSR_IA32_SYSENTER_EIP));
    vmcs_write64(HOST_IA32_EFER, native_read_msr(MSR_EFER));
}

/*
 * A short run of HLT instructions. Marked `used` so the compiler keeps it
 * even though nothing calls it directly.
 */
static void __attribute__((used)) RIPTEST(void){
    __asm__ __volatile__("hlt; hlt; hlt; hlt; hlt; hlt");
}

/*
 * Fill the guest-state area of the current VMCS.
 *
 * vms: per-CPU state; supplies the initial guest RIP and RSP.
 *
 * CR0/CR3/CR4, EFER and the FS/GS bases are copied from the running host.
 * All selectors are zero; CS/SS/DS/ES have base 0, and every segment except
 * LDTR and TR has limit 0xFFFFFFFF. GDTR, IDTR and the SYSENTER fields are
 * zeroed.
 */
static void vmx_setup_initial_guest_state(vmstate *vms){
    const unsigned long selector_fields[] = {
        GUEST_CS_SELECTOR, GUEST_SS_SELECTOR, GUEST_DS_SELECTOR, GUEST_ES_SELECTOR,
        GUEST_FS_SELECTOR, GUEST_GS_SELECTOR, GUEST_LDTR_SELECTOR, GUEST_TR_SELECTOR
    };
    const unsigned long zero_base_fields[] = {
        GUEST_CS_BASE, GUEST_SS_BASE, GUEST_DS_BASE, GUEST_ES_BASE
    };
    const unsigned long full_limit_fields[] = {
        GUEST_CS_LIMIT, GUEST_SS_LIMIT, GUEST_DS_LIMIT,
        GUEST_ES_LIMIT, GUEST_FS_LIMIT, GUEST_GS_LIMIT
    };
    const unsigned long data_ar_fields[] = {
        GUEST_SS_AR_BYTES, GUEST_DS_AR_BYTES, GUEST_ES_AR_BYTES,
        GUEST_FS_AR_BYTES, GUEST_GS_AR_BYTES
    };
    unsigned int k;

    // Control and debug registers
    vmcs_write(GUEST_CR0, read_cr0());
    vmcs_write(GUEST_CR3, __read_cr3());
    vmcs_write(GUEST_CR4, __read_cr4());
    vmcs_write(GUEST_DR7, 0);

    // Entry point, stack and flags
    vmcs_write(GUEST_RIP, vms->initial_rip);
    //vmcs_write(GUEST_RIP, (unsigned long)RIPTEST);
    vmcs_write(GUEST_RSP, vms->initial_rsp);
    vmcs_write(GUEST_RFLAGS, 0x2); // Only the always-one reserved bit 1 is set

    // Selectors: all null
    for(k = 0; k < sizeof(selector_fields) / sizeof(selector_fields[0]); k++)
        vmcs_write(selector_fields[k], 0);

    // Base addresses
    for(k = 0; k < sizeof(zero_base_fields) / sizeof(zero_base_fields[0]); k++)
        vmcs_write(zero_base_fields[k], 0);
    vmcs_write(GUEST_FS_BASE, native_read_msr(MSR_FS_BASE));
    vmcs_write(GUEST_GS_BASE, native_read_msr(MSR_GS_BASE));
    vmcs_write(GUEST_LDTR_BASE, 0);
    vmcs_write(GUEST_TR_BASE, 0);

    // Segment limits
    for(k = 0; k < sizeof(full_limit_fields) / sizeof(full_limit_fields[0]); k++)
        vmcs_write(full_limit_fields[k], 0xFFFFFFFF);
    vmcs_write(GUEST_LDTR_LIMIT, 0);
    vmcs_write(GUEST_TR_LIMIT, 0xFF);

    // Segment access rights
    // https://www.amd.com/system/files/TechDocs/24593.pdf#G10.910849
    vmcs_write(GUEST_CS_AR_BYTES, 0xA09B);
    for(k = 0; k < sizeof(data_ar_fields) / sizeof(data_ar_fields[0]); k++)
        vmcs_write(data_ar_fields[k], 0xA093);
    vmcs_write(GUEST_LDTR_AR_BYTES, 0x0082);
    vmcs_write(GUEST_TR_AR_BYTES, 0x008B);

    // GDTR & IDTR
    vmcs_write(GUEST_GDTR_BASE, 0);
    vmcs_write(GUEST_IDTR_BASE, 0);
    vmcs_write(GUEST_GDTR_LIMIT, 0);
    vmcs_write(GUEST_IDTR_LIMIT, 0);

    // MSR-backed fields
    vmcs_write(GUEST_IA32_EFER, native_read_msr(MSR_EFER));
    vmcs_write64(GUEST_IA32_DEBUGCTL, 0);

    // Sysenter primitives
    vmcs_write(GUEST_SYSENTER_CS, 0);
    vmcs_write(GUEST_SYSENTER_ESP, 0);
    vmcs_write(GUEST_SYSENTER_EIP, 0);
}

/*
 * Make this CPU's VMCS current and populate it.
 *
 * vms: per-CPU state holding the VMCS physical address and the EPTP.
 *
 * After VMPTRLD the controls, guest state and host state are written,
 * followed by the VMCS link pointer (all ones) and the EPT pointer.
 */
static void init_vmcs(vmstate *vms){
    VMPTRLD(vms->vmcs_physical);

    // Field groups, each handled by its own helper
    vmx_setup_vm_controls();
    vmx_setup_initial_guest_state(vms);
    vmx_setup_initial_host_state(vms);

    // All ones in the link pointer
    vmcs_write64(VMCS_LINK_POINTER, -1ull);

    //vmcs_write(EXCEPTION_BITMAP, 8192);

    // Root of the guest-physical address translation
    vmcs_write64(EPT_POINTER, vms->eptp.value);
    //vmcs_write(VIRTUAL_PROCESSOR_ID, vms->vpid);
}

/*
 * Initialise the current CPU's VMCS and launch the guest on it.
 *
 * Returns 0.
 *
 * get_cpu() disables preemption so the VMCS that is loaded and launched
 * belongs to the CPU we keep running on; it also balances the put_cpu()
 * below, which previously had no matching get_cpu().
 */
int vmx_launch(void){
    int cpu = get_cpu();
    vmstate *vms = per_cpu(cpu_vms, cpu);

    printk(KERN_INFO "Launching VM on CPU %d\n", cpu);
    init_vmcs(vms);
    VMLAUNCH();

    put_cpu();
    return 0;
}

/*
 * Allocate per-CPU VMX state for every online CPU, run prepare_vmx_cpu() on
 * each of them and verify that VMX got enabled everywhere.
 *
 * Returns 0 on success and -1 if an allocation fails or if any CPU reports
 * vmx_enabled == false (in the latter case vmx_teardown() is called first).
 *
 * Allocation results are checked before use: the regions are handed to
 * virt_to_phys() here and written to by prepare_vmx_cpu().
 * NOTE(review): on allocation failure only the buffers of the CPU being set
 * up are freed; state already published for earlier CPUs (and the vmstate
 * object itself) is left as is because its release routine is not visible
 * here - confirm the intended cleanup.
 */
int vmx_setup(void){
    int i;
    vmstate* vms;
    printk(KERN_INFO "NUM CPUS: %d\n", num_online_cpus());

    for_each_online_cpu(i){
        vms = create_vmstate();
        if(!vms){
            printk(KERN_ALERT "Failed to allocate VMX state for CPU %d\n", i);
            return -1;
        }

        vms->vmxon_region = kmalloc(4096, GFP_KERNEL);
        vms->vmcs_region = kzalloc(4096, GFP_KERNEL);
        vms->vmm_handle_stack_size = 4096;
        vms->vmm_handle_stack = kmalloc(vms->vmm_handle_stack_size, GFP_KERNEL);
        if(!vms->vmxon_region || !vms->vmcs_region || !vms->vmm_handle_stack){
            printk(KERN_ALERT "Failed to allocate VMX regions for CPU %d\n", i);
            /* kfree(NULL) is a no-op, so no per-pointer guards are needed. */
            kfree(vms->vmm_handle_stack);
            kfree(vms->vmcs_region);
            kfree(vms->vmxon_region);
            vms->vmm_handle_stack = NULL;
            vms->vmcs_region = NULL;
            vms->vmxon_region = NULL;
            return -1;
        }

        vms->vmxon_physical = virt_to_phys(vms->vmxon_region);
        vms->vmcs_physical = virt_to_phys(vms->vmcs_region);
        vms->vpid = get_free_vpid();
        per_cpu(cpu_vms, i) = vms;
    }

    on_each_cpu(prepare_vmx_cpu, NULL, 1);
    printk(KERN_INFO "CPUS prepared!");

    for_each_online_cpu(i){
        vms = per_cpu(cpu_vms, i);
        if(vms->vmx_enabled == false) {
            printk(KERN_ALERT "Tearing down after VMXON failed!");
            vmx_teardown();
            return -1;
        }
    }
    printk(KERN_INFO "VMX turned on for all CPUs!");
    return 0;
}

VMCS dump:

***Guest State***
[   72.414906] CR0: actual=0x0000000080050033, shadow=0x0000000080050033, gh_mask=ffffffffffffffff
[   72.416865] CR4: actual=0x00000000000626e0, shadow=0x00000000000626e0, gh_mask=ffffffffffffffff
[   72.419147] CR3 = 0x00000000307ce004
[   72.419950] PDPTR0 = 0x0000000000000000  PDPTR1 = 0x0000000000000000
[   72.421384] PDPTR2 = 0x0000000000000000  PDPTR3 = 0x0000000000000000
[   72.422753] RSP = 0xffff9c9cb31f8fff  RIP = 0xffff9c9cb5005000
[   72.424510] RFLAGS=0x00000002         DR7 = 0x0000000000000000
[   72.426501] Sysenter RSP=0000000000000000 CS:RIP=0000:0000000000000000
[   72.428141] CS:   sel=0x0000, attr=0x0a09b, limit=0xffffffff, base=0x0000000000000000
[   72.430162] DS:   sel=0x0000, attr=0x0a093, limit=0xffffffff, base=0x0000000000000000
[   72.432075] SS:   sel=0x0000, attr=0x0a093, limit=0xffffffff, base=0x0000000000000000
[   72.433982] ES:   sel=0x0000, attr=0x0a093, limit=0xffffffff, base=0x0000000000000000
[   72.436152] FS:   sel=0x0000, attr=0x0a093, limit=0xffffffff, base=0x00007f8e51f0c4c0
[   72.438437] GS:   sel=0x0000, attr=0x0a093, limit=0xffffffff, base=0xffff9c9cbeb00000
[   72.440579] GDTR:                           limit=0x00000000, base=0x0000000000000000
[   72.442241] LDTR: sel=0x0000, attr=0x00082, limit=0x00000000, base=0x0000000000000000
[   72.443414] IDTR:                           limit=0x00000000, base=0x0000000000000000
[   72.444591] TR:   sel=0x0000, attr=0x0008b, limit=0x000000ff, base=0x0000000000000000
[   72.447023] EFER =     0x0000000000000d01  PAT = 0x0000000000000000
[   72.448999] DebugCtl = 0x0000000000000000  DebugExceptions = 0x0000000000000000
[   72.451813] PerfGlobCtl = 0x0000000000000000
[   72.453316] BndCfgS = 0x0000000000000000
[   72.454528] Interruptibility = 00000000  ActivityState = 00000000
[   72.456302] InterruptStatus = 0000
[   72.456997] *** Host State ***
[   72.457622] RIP = 0xffffffffc0789b90  RSP = 0xffff9c9cb5019fff
[   72.458766] CS=0010 SS=0018 DS=0018 ES=0018 FS=0000 GS=0000 TR=0040
[   72.460007] FSBase=00007f8e51f0c4c0 GSBase=ffff9c9cbeb00000 TRBase=0000000000000000
[   72.461588] GDTBase=fffffe000002c000 IDTBase=fffffe0000000000
[   72.462711] CR0=0000000080050033 CR3=00000000307ce004 CR4=00000000000626e0
[   72.464083] Sysenter RSP=fffffe000002d200 CS:RIP=0010:ffffffff848015f0
[   72.465472] EFER = 0x0000000000000d01  PAT = 0x0000000000000000
[   72.467041] PerfGlobCtl = 0x0000000000000000
[   72.468110] *** Control State ***
[   72.469024] PinBased=00000016 CPUBased=8401e1f2 SecondaryExec=0000000a
[   72.470863] EntryControls=000093ff ExitControls=00236fff
[   72.472268] ExceptionBitmap=00000000 PFECmask=00000000 PFECmatch=00000000
[   72.474137] VMEntry: intr_info=00000000 errcode=00000000 ilen=00000000
[   72.475580] VMExit: intr_info=00000000 errcode=00000000 ilen=00000000
[   72.477230]         reason=80000021 qualification=0000000000000000
[   72.478806] IDTVectoring: info=00000000 errcode=00000000
[   72.480156] TSC Offset = 0x0000000000000000
[   72.481316] SVI|RVI = 00|00 TPR Threshold = 0x00
[   72.482305] APIC-access addr = 0x0000000000000000 virt-APIC addr = 0x0000000000000000
[   72.484216] PostedIntrVec = 0x00
[   72.484928] EPT pointer = 0x000003500200005e
[   72.485835] Virtual processor ID = 0x0000
c
linux-kernel
virtualization
hypervisor
asked on Stack Overflow Dec 11, 2019 by Paul • edited Dec 13, 2019 by Paul

1 Answer

2

The problem is that EPTP has non-zero bits above the processor physical address width. (The physical address width for i3-2130 is 36 bits, I think.)

This should not be reported as an invalid guest state error. Instead it should be an invalid control field error (failed VM entry with error code 7), which is what I see when I test this on real hardware. I think KVM is virtualizing this error incorrectly.

The only way that enabling EPT can cause an invalid guest state error is if the PDPTEs are invalid, which can only happen if the guest paging mode is PAE, not ia32e. (Section 26.3.1.6.)

The problem in the code is that it needs to right shift the address by 12 before storing it into the phys_addr field. See the definition of EPTP in section 24.6.11. The pml4_phys_addr field should contain bits 35:12 of the physical address. Bits 11:0 are not represented (since they are all 0). You can use one of these solutions:

Option A:

eptp.fields.memtype = 6;
eptp.fields.page_walk = 3;
eptp.fields.accessed_and_dirty_flags_enabled = 1;
eptp.fields.pml4_phys_addr = virt_to_phys(ept_pml4) >> 12;

Option B:

eptp.fields.memtype = 6;
eptp.fields.page_walk = 3;
eptp.fields.accessed_and_dirty_flags_enabled = 1;
eptp.value |= virt_to_phys(ept_pml4);

Option C:

eptp.value = virt_to_phys(ept_pml4);
eptp.fields.memtype = 6;
eptp.fields.page_walk = 3;
eptp.fields.accessed_and_dirty_flags_enabled = 1;

Make a similar change to all the code that initializes EPT entries.

answered on Stack Overflow Dec 13, 2019 by prl • edited Dec 13, 2019 by prl

User contributions licensed under CC BY-SA 3.0