What is the right sequence for entering VMX root operation in Linux?

0

I am working on a Linux kernel module (KVM and KVM_intel have already been unloaded) to test Intel's VMX functionality, and I am trying to work out the prerequisites for entering VMX root operation. My kernel module uses the Linux file-operations interface to provide a device node through which a user-space program can perform VMX operations.
Here is the code for reference (it comes from https://www.cs.usfca.edu/~cruse/cs686s07/nmiexits.c and was modified to work with my Linux 2.6.32 kernel; the other files it needs also come from that site):

//-------------------------------------------------------------------
//    nmiexits.c    (A modification of our 'linuxvmm.c' module)
//
//    This Linux kernel module implements a device-driver (named
//    '/dev/vmm') which lets an application program execute some
//    real-mode code in Virtual-8086 mode within a guest virtual
//    machine, assuming the cpu supports Intel VMX instructions.
//
//    This modification sets the pin-based VM Execution Controls
//    so that control passes to our Virtual Machine Manager when
//    any external interrupt (or non-maskable interrupt) occurs.
//    These asynchronous events are then serviced by Linux ISRs,
//    and our guest VM is resumed.  For the case of non-maskable
//    interrupts, the host executes the 'int $0x02' instruction;
//    for the case of external interrupts, the appropriate Linux
//    interrupt service routine automatically gets executed when
//    the host executes 'sti', which allows the CPU to recognize
//    the still-pending external interrupt-request.
//
//        compile using:  $ mmake nmiexits
//        install using:  $ /sbin/insmod nmiexits.ko
//
//    NOTE: Written and tested using Linux x86_64 kernel 2.6.17.
//
//    programmer: ALLAN CRUSE
//    date begun: 29 APR 2007
//    completion: 03 MAY 2007    -- our initial driver-prototype
//    revised on: 14 MAY 2007 -- sets 'interrupt-exiting' control
//    revised on: 24 MAY 2007 -- sets the 'NMI-exiting' control
//    revised on: 21 JUL 2008 -- for Linux kernel version 2.6.26.
//-------------------------------------------------------------------

#include <linux/kernel.h>
#include <linux/module.h>    // for init_module()
#include <linux/proc_fs.h>    // for create_proc_read_entry()
#include <linux/fs.h>        // for struct file_operations
#include <asm/io.h>        // for virt_to_phys()
#include <asm/uaccess.h>    // for copy_from_user()
#include <linux/slab.h> // for init_module()
#include <linux/mm.h>           // for remap_pfn_range()
#include <linux/seq_file.h>
#include "machine.h"        // for our VMCS fields
#include "myvmx.h"        // for 'regs_ia32'

// Sizing of the kernel memory arenas allocated in init_module().
#define N_ARENAS    11    // number of 64KB memory allocations
#define ARENA_LENGTH  (64<<10)    // size of each allocated memory-arena

// MSR indices of the VMX-capability registers.  init_module() reads them
// consecutively, starting at IA32_VMX_BASIC, into msr0x480[].
#define IA32_VMX_BASIC               0x0480
#define IA32_VMX_PINBASED_CTLS       0x0481
#define IA32_VMX_PROCBASED_CTLS      0x0482
#define IA32_VMX_EXIT_CTLS           0x0483
#define IA32_VMX_ENTRY_CTLS          0x0484
#define IA32_VMX_MISC                0x0485
#define IA32_VMX_CR0_FIXED0          0x0486
#define IA32_VMX_CR0_FIXED1          0x0487
#define IA32_VMX_CR4_FIXED0          0x0488
#define IA32_VMX_CR4_FIXED1          0x0489
#define IA32_VMX_VMCS_ENUM           0x048A
#define IA32_VMX_PROCBASED_CTLS2     0x048B
#define IA32_VMX_EPT_VPID_CAP        0x048C
#define IA32_VMX_TRUE_PINBASED_CTLS  0x048D
#define IA32_VMX_TRUE_PROCBASED_CTLS 0x048E
#define IA32_VMX_TRUE_EXIT_CTLS      0x048F
#define IA32_VMX_TRUE_ENTRY_CTLS     0x0490
// Count of MSRs in the range above (17); this is the length of msr0x480[].
#define NUM_VMX_MSR                 (IA32_VMX_TRUE_ENTRY_CTLS - IA32_VMX_BASIC + 1) 
// Also the exact mapping length my_mmap() requires, and the base that
// my_ioctl() adds to the *_KERN_OFFSET values to form guest addresses.
#define LEGACY_REACH  0x110000    // end of 'real-addressible' memory

// Byte offsets into arena kmem[10] of the guest's system structures
// (the VMXON region and guest VMCS sit at offsets 0x0000 and 0x1000).
// TOS_KERN_OFFSET and ISR_KERN_OFFSET deliberately share one value.
#define PAGE_DIR_OFFSET    0x2000
#define PAGE_TBL_OFFSET    0x3000
#define IDT_KERN_OFFSET    0x4000
#define GDT_KERN_OFFSET    0x4800
#define LDT_KERN_OFFSET    0x4A00
#define TSS_KERN_OFFSET    0x4C00
#define TOS_KERN_OFFSET    0x8000
#define ISR_KERN_OFFSET    0x8000
// Guest segment selectors.  my_ioctl() uses (selector >> 3) as the table
// index: TASK and LDTR index the guest GDT, the other four the guest LDT.
#define __SELECTOR_TASK    0x0008
#define __SELECTOR_LDTR    0x0010
#define __SELECTOR_CODE    0x0004
#define __SELECTOR_DATA    0x000C
#define __SELECTOR_VRAM    0x0014
#define __SELECTOR_FLAT    0x001C

// NOTE: several of these globals are referenced by symbol name from the
// inline assembly in init_module() and my_ioctl(); do not rename them or
// make them static without updating that assembly.
char modname[] = "nmiexits";    // name passed to register_chrdev() and printk()
int my_major = 88;              // fixed character-device major number
char cpu_oem[16];               // CPUID leaf-0 vendor string (12 bytes used)
int cpu_features;               // ECX returned by CPUID leaf 1
void *kmem[ N_ARENAS ];         // kernel addresses of the kmalloc'ed arenas
unsigned long msr0x480[ NUM_VMX_MSR ];  // snapshot of MSRs 0x480..0x490
unsigned long cr0, cr4;         // CR0/CR4 as sampled in init_module()
unsigned long msr_efer;         // MSR_EFER as sampled in init_module()
// Physical addresses (virt_to_phys) of the regions carved out of kmem[10].
unsigned long vmxon_region;
unsigned long guest_region;
unsigned long pgdir_region;
unsigned long pgtbl_region;
unsigned long g_IDT_region;
unsigned long g_GDT_region;
unsigned long g_LDT_region;
unsigned long g_TSS_region;
unsigned long g_TOS_region;
unsigned long g_ISR_region;

//============================================================
// Forward declarations: my_fops refers to these handlers before they
// are defined further down the file.
long my_ioctl( struct file *, unsigned int, unsigned long );
int my_release ( struct inode *inode, struct file *file );

// mmap handler: maps the guest's 'real-mode' address range into the
// calling process.  The client must request exactly LEGACY_REACH bytes
// at user virtual address 0; anything else yields -EINVAL.
//
// The range is filled in 64KB steps.  Steps 0x0-0x9 (and step 16, which
// wraps around to arena 0) are backed by the kmem[] arenas; steps
// 0xA-0xF are mapped to the physical address equal to the user address.
// Returns 0 on success, or -EAGAIN if remap_pfn_range() fails.
int my_mmap( struct file *file, struct vm_area_struct *vma )
{
    unsigned long    uaddr = vma->vm_start;
    int              step;

    // only the one prescribed placement and size is accepted
    if ( uaddr != 0x00000000L ||
         ( vma->vm_end - vma->vm_start ) != LEGACY_REACH )
        return -EINVAL;

    // (marking the vma VM_RESERVED is left disabled, as in the original)

    // install page-table entries, one 64KB arena-sized chunk at a time
    for (step = 0; step < N_ARENAS+6; step++, uaddr += ARENA_LENGTH) {
        int              slot = step % 16;
        unsigned long    phys;

        if ( slot < 0xA )
            phys = virt_to_phys( kmem[ slot ] );
        else
            phys = uaddr;

        if ( remap_pfn_range( vma, uaddr, phys >> PAGE_SHIFT,
                              ARENA_LENGTH, vma->vm_page_prot ) != 0 )
            return -EAGAIN;
    }

    // seed arena 0x0 with page-frame 0x000 (for IVT and BDA)
    memcpy( kmem[0], phys_to_virt( 0x00000 ), PAGE_SIZE );

    // seed arena 0x9 with page-frames 0x90 to 0x9F (for EBDA)
    memcpy( kmem[9], phys_to_virt( 0x90000 ), ARENA_LENGTH );

    return 0;
}


// File operations of the character device: mmap exposes the guest
// memory arenas, unlocked_ioctl launches the guest, and release runs
// when the device file is closed.
struct file_operations my_fops = {
    .owner          = THIS_MODULE,
    .unlocked_ioctl = my_ioctl,
    .mmap           = my_mmap,
    .release        = my_release,
};

void set_CR4_vmxe( void *dummy )
{
    asm(    " mov %%cr4, %%rax     \n"\
        " bts $13, %%rax     \n"\
        " mov %%rax, %%cr4    " ::: "ax" );
}

void clear_CR4_vmxe( void *dummy )
{
    asm(    " mov %%cr4, %%rax     \n"\
        " btr $13, %%rax     \n"\
        " mov %%rax, %%cr4    " ::: "ax" );
}

// Reads the model-specific register whose index is 'ecx' and returns
// its 64-bit value (RDMSR delivers the halves in EDX:EAX).
static inline u64
vmx_rdmsr (u32 ecx)
{
    u32 lo, hi;

    asm volatile ("rdmsr" : "=a" (lo), "=d" (hi) : "c" (ecx));
    return ((u64) hi << 32) | (u64) lo;
}

// Writes the 64-bit value 'val' to the model-specific register whose
// index is 'ecx' (WRMSR takes the halves in EDX:EAX).
static inline void
vmx_wrmsr (u32 ecx, u64 val)
{
    asm volatile ("wrmsr"
                  : /* no outputs */
                  : "d" ((u32) (val >> 32)), "a" ((u32) val), "c" (ecx));
}

// Module entry point.
//
// Steps, in order:
//   1. read the CPUID vendor string and, for "GenuineIntel", the leaf-1
//      feature flags; bail out with -ENODEV unless bit 5 of ECX is set;
//   2. snapshot the NUM_VMX_MSR VMX-capability MSRs into msr0x480[];
//   3. snapshot CR0, CR4 and MSR_EFER;
//   4. allocate and zero N_ARENAS arenas of ARENA_LENGTH bytes;
//   5. record the physical addresses of the structures inside kmem[10];
//   6. register the character device.
//
// Returns the result of register_chrdev() on success, -ENODEV when VMX
// is not reported, or -ENOMEM when an arena cannot be allocated.  Every
// failure path releases whatever arenas were already allocated, since
// cleanup_module() is not run for a module whose init failed.
int init_module( void )
{
    int    i, j, rc;

    // confirm installation and show device-major number
    printk( "<1>\nInstalling \'%s\' module ", modname );
    printk( "(major=%d) \n", my_major );

    // verify processor supports Intel Virtualization Technology
    asm(    " xor     %%eax, %%eax        \n"\
        " cpuid                \n"\
        " mov    %%ebx, cpu_oem+0     \n"\
        " mov    %%edx, cpu_oem+4     \n"\
        " mov    %%ecx, cpu_oem+8     \n"\
        ::: "ax", "bx", "cx", "dx"    );
    printk( " processor is \'%s\' \n", cpu_oem );

    // cpu_features stays 0 for any other vendor, so the test below fails
    if ( strncmp( cpu_oem, "GenuineIntel", 12 ) == 0 )
        asm(    " mov    $1, %%eax        \n"\
            " cpuid                \n"\
            " mov    %%ecx, cpu_features    \n"\
            ::: "ax", "bx", "cx", "dx"     );
    if ( ( cpu_features & (1<<5) ) == 0 )
        {
        printk( " Virtualization Technology is unsupported \n" );
        return    -ENODEV;
        }
    else    printk( " Virtualization Technology is supported \n" );

    // read contents of the VMX-Capability Model-Specific Registers;
    // the loop bound is NUM_VMX_MSR so it always matches msr0x480[]
    // NOTE(review): MSRs 0x48B..0x490 are read unconditionally; confirm
    // that every target CPU implements them, since RDMSR of a missing
    // MSR faults.
    asm(    " xor    %%rbx, %%rbx            \n"\
        " mov    %0, %%rcx            \n"\
        "nxcap:                    \n"\
        " rdmsr                    \n"\
        " mov    %%eax, msr0x480+0(, %%rbx, 8)    \n"\
        " mov    %%edx, msr0x480+4(, %%rbx, 8)    \n"\
        " inc    %%rcx                \n"\
        " inc    %%rbx                \n"\
        " cmp    %1, %%rbx            \n"\
        " jb    nxcap                \n"\
        :: "i" (IA32_VMX_BASIC), "i" (NUM_VMX_MSR)
        : "ax", "bx", "cx", "dx"     );

    // preserve the initial values in relevant system registers
    asm( " mov %%cr0, %%rax \n mov %%rax, cr0 " ::: "ax" );
    asm( " mov %%cr4, %%rax \n mov %%rax, cr4 " ::: "ax" );

    asm(    " mov    %0, %%ecx        \n"\
        " rdmsr                \n"\
        " mov    %%eax, msr_efer+0    \n"\
        " mov    %%edx, msr_efer+4    \n"\
        :: "i" (MSR_EFER) : "ax", "cx", "dx" );

    // allocate page-aligned blocks of non-pageable kernel memory
    for (i = 0; i < N_ARENAS; i++)
        {
        kmem[ i ] = kmalloc( ARENA_LENGTH, GFP_KERNEL );
        if ( kmem[ i ] == NULL )
            {
            for (j = 0; j < i; j++) kfree( kmem[ j ] );
            return    -ENOMEM;
            }
        else    memset( kmem[ i ], 0x00, ARENA_LENGTH );
        }

    // assign usages to allocated kernel memory areas
    vmxon_region = virt_to_phys( kmem[ 10 ] + 0x0000 );
    guest_region = virt_to_phys( kmem[ 10 ] + 0x1000 );
    pgdir_region = virt_to_phys( kmem[ 10 ] + PAGE_DIR_OFFSET );
    pgtbl_region = virt_to_phys( kmem[ 10 ] + PAGE_TBL_OFFSET );
    g_IDT_region = virt_to_phys( kmem[ 10 ] + IDT_KERN_OFFSET );
    g_GDT_region = virt_to_phys( kmem[ 10 ] + GDT_KERN_OFFSET );
    g_LDT_region = virt_to_phys( kmem[ 10 ] + LDT_KERN_OFFSET );
    g_TSS_region = virt_to_phys( kmem[ 10 ] + TSS_KERN_OFFSET );
    g_TOS_region = virt_to_phys( kmem[ 10 ] + TOS_KERN_OFFSET );
    g_ISR_region = virt_to_phys( kmem[ 10 ] + ISR_KERN_OFFSET );

    // publish the device last; if that fails, give the arenas back,
    // because nothing else will free them once init has failed
    rc = register_chrdev( my_major, modname, &my_fops );
    if ( rc < 0 )
        {
        for (i = 0; i < N_ARENAS; i++) kfree( kmem[ i ] );
        }

    return    rc;
}

// Module exit point: clears CR4.VMXE (via smp_call_function() and then
// directly on the calling CPU), unregisters the character device, and
// frees every memory arena.
void cleanup_module( void )
{
    int    arena = 0;

    smp_call_function( clear_CR4_vmxe, NULL, 1 );
    clear_CR4_vmxe( NULL );

    unregister_chrdev( my_major, modname );

    while ( arena < N_ARENAS )
        kfree( kmem[ arena++ ] );

    printk( "<1>Removing \'%s\' module\n", modname );
}

MODULE_LICENSE("GPL");

// NOTE: the variables below are accessed by symbol name from the inline
// assembly in my_ioctl(); keep their names and types in sync with it.

// Storage for SGDT/SIDT (10 bytes each); my_ioctl() reads the 64-bit
// base starting at element 1, i.e. just past the 16-bit limit.
unsigned short    _gdtr[ 5 ], _idtr[ 5 ];
// Guest general-purpose registers, loaded before VM entry and saved
// again after VM exit.  _esp is not referenced in the code visible here
// (the guest stack pointer travels in guest_RSP instead).
unsigned int    _eax, _ebx, _ecx, _edx, _esp, _ebp, _esi, _edi;
// Progress/status code written by the launch sequence in my_ioctl() and
// reset to 0 by my_release().
int        retval = -1;

// VM-exit counters, zeroed at the start of every my_ioctl() call.
int    nmiints = 0;
int    extints = 0;

// Kernel-side copy of the register image exchanged with the client.
regs_ia32    vm;

long my_ioctl( struct file *file, unsigned int count, unsigned long buf)
{
    unsigned long   *gdt, *ldt, *idt;
    unsigned int    *pgtbl, *pgdir, *tss, phys_addr = 0;
    signed long     desc = 0;
    int             i, j;

    // sanity check: we require the client-process to pass an
    // exact amount of data representing CPU's register-state
    if ( count != sizeof( regs_ia32 ) ) return -EINVAL;

    // reinitialize the Virtual Machine Control Stuctures
    memset( phys_to_virt( vmxon_region ), 0x00, PAGE_SIZE );
    memset( phys_to_virt( guest_region ), 0x00, PAGE_SIZE );
    memcpy( phys_to_virt( vmxon_region ), msr0x480, 4 );
    memcpy( phys_to_virt( guest_region ), msr0x480, 4 );

    // initialize our guest-task's page-table and page-directory
    pgtbl = (unsigned int*)phys_to_virt( pgtbl_region );
    for (i = 0; i < 18; i++) {
        switch ( i ) {
            case 0: case 1: case 2: case 3: case 4:
            case 5: case 6: case 7: case 8: case 9:
                phys_addr = virt_to_phys( kmem[ i ] ); break;
            case 10: case 11: case 12: case 13: case 14: case 15:
                phys_addr = i * ARENA_LENGTH; break;
            case 16:
                phys_addr = virt_to_phys( kmem[ 0 ] ); break;
            case 17:
                phys_addr = virt_to_phys( kmem[ 10 ] ); break;
        }
        for (j = 0; j < 16; j++)
            pgtbl[ i*16 + j ] = phys_addr + (j << PAGE_SHIFT) + 7;
    }
    pgdir = (unsigned int*)phys_to_virt( pgdir_region );
    pgdir[ 0 ] = (unsigned int)pgtbl_region + 7;

    // copy the client's virtual-machine register-values
    if ( copy_from_user( &vm, (void*)buf, count ) ) return -EFAULT;
    guest_ES_selector = vm.es;
    guest_CS_selector = vm.cs;
    guest_SS_selector = vm.ss;
    guest_DS_selector = vm.ds;
    guest_FS_selector = vm.fs;
    guest_GS_selector = vm.gs;
    _eax = vm.eax;
    _ebx = vm.ebx;
    _ecx = vm.ecx;
    _edx = vm.edx;
    _ebp = vm.ebp;
    _esi = vm.esi;
    _edi = vm.edi;
    guest_RSP = vm.esp;
    guest_RIP = vm.eip;
    guest_RFLAGS = vm.eflags;
    guest_RFLAGS |= (1 << 17);    // VM=1 (for Virtual-8086 mode)
    guest_RFLAGS |= (1 <<  1);    // it's essential to set bit #1
    // setup other guest-state fields (for Virtual-8086 mode)
    guest_ES_base = (guest_ES_selector << 4);
    guest_CS_base = (guest_CS_selector << 4);
    guest_SS_base = (guest_SS_selector << 4);
    guest_DS_base = (guest_DS_selector << 4);
    guest_FS_base = (guest_FS_selector << 4);
    guest_GS_base = (guest_GS_selector << 4);
    guest_ES_limit = 0xFFFF;
    guest_CS_limit = 0xFFFF;
    guest_SS_limit = 0xFFFF;
    guest_DS_limit = 0xFFFF;
    guest_FS_limit = 0xFFFF;
    guest_GS_limit = 0xFFFF;
    guest_ES_access_rights = 0xF3;
    guest_CS_access_rights = 0xF3;
    guest_SS_access_rights = 0xF3;
    guest_DS_access_rights = 0xF3;
    guest_FS_access_rights = 0xF3;
    guest_GS_access_rights = 0xF3;

    guest_CR0 = 0x80000031;
    guest_CR4 = 0x00002011;
    guest_CR3 = pgdir_region;
    guest_VMCS_link_pointer_full = 0xFFFFFFFF;
    guest_VMCS_link_pointer_high = 0xFFFFFFFF;

    guest_IDTR_base = LEGACY_REACH + IDT_KERN_OFFSET;
    guest_GDTR_base = LEGACY_REACH + GDT_KERN_OFFSET;
    guest_LDTR_base = LEGACY_REACH + LDT_KERN_OFFSET;
    guest_TR_base   = LEGACY_REACH + TSS_KERN_OFFSET;
    guest_IDTR_limit = (256 * 8) - 1;
    guest_GDTR_limit = (3 * 8) - 1;
    guest_LDTR_limit = (4 * 8) - 1;
    guest_TR_limit   = (26 * 4) + 0x20 + 0x2000;
    guest_LDTR_access_rights = 0x82;
    guest_TR_access_rights   = 0x8B;
    guest_LDTR_selector = __SELECTOR_LDTR;
    guest_TR_selector   = __SELECTOR_TASK;

    // provisionally initialize our guest-task's LDTR
    ldt = (unsigned long*)phys_to_virt( g_LDT_region );
    ldt[ __SELECTOR_CODE >> 3 ] = 0x00CF9B000000FFFF;
    ldt[ __SELECTOR_DATA >> 3 ] = 0x00CF93000000FFFF;
    ldt[ __SELECTOR_VRAM >> 3 ] = 0x0000920B8000FFFF;
    ldt[ __SELECTOR_FLAT >> 3 ] = 0x008F92000000FFFF;
    // Adjust the CODE and DATA descriptors here
    desc = LEGACY_REACH + ISR_KERN_OFFSET;
    desc <<= 16;
    desc &= 0x000000FFFFFF0000;
    ldt[ __SELECTOR_CODE >> 3 ] |= desc;
    ldt[ __SELECTOR_DATA >> 3 ] |= desc;

    // initialize our guest-task's GDTR
    gdt = (unsigned long*)phys_to_virt( g_GDT_region );
    desc = 0x00008B0000000000;
    desc |= (guest_TR_base << 32)&0xFF00000000000000;
    desc |= (guest_TR_base << 16)&0x000000FFFFFF0000;
    desc |= (guest_TR_limit & 0xFFFF);
    gdt[ __SELECTOR_TASK >> 3 ] = desc;
    desc = 0x0000820000000000;
    desc |= ( guest_LDTR_base << 32)&0xFF00000000000000;
    desc |= ( guest_LDTR_base << 16)&0x000000FFFFFF0000;
    desc |= ( guest_LDTR_limit & 0xFFFF );
    gdt[ __SELECTOR_LDTR >> 3 ] = desc;

    // initialize our guest's IDT
    idt = (unsigned long*)phys_to_virt( g_IDT_region );
    desc = 0;        // offset-address for GPF isr
    desc &= 0x00000000FFFFFFFF;
    desc |= (desc << 32);
    desc &= 0xFFFF00000000FFFF;
    desc |= ( __SELECTOR_CODE << 16);
    desc |= 0x00008E0000000000;
    idt[ 13 ] = desc;

    // initialize our guest's Task-State Segment
    tss = (unsigned int*)phys_to_virt( g_TSS_region );
    tss[ 1 ] = TOS_KERN_OFFSET;
    tss[ 2 ] = __SELECTOR_DATA;
    tss[ 25 ] = 0x00880000;
    tss[ guest_TR_limit >> 2 ] = 0xFF;

    //----------------------------------------------------
    // initialize the global variables for the host state
    //----------------------------------------------------
    asm(" mov %%cr0, %%rax \n mov %%rax, host_CR0 " ::: "ax" );
    asm(" mov %%cr4, %%rax \n mov %%rax, host_CR4 " ::: "ax" );
    asm(" mov %%cr3, %%rax \n mov %%rax, host_CR3 " ::: "ax" );
    asm(" str host_TR_selector ");
    asm(" mov %es, host_ES_selector ");    
    asm(" mov %cs, host_CS_selector ");    
    asm(" mov %ss, host_SS_selector ");    
    asm(" mov %ds, host_DS_selector ");    
    asm(" mov %fs, host_FS_selector ");    
    asm(" mov %gs, host_GS_selector ");    
    asm(" sgdt _gdtr \n sidt _idtr ");
    host_GDTR_base = *(unsigned long*)( _gdtr+1 );
    host_IDTR_base = *(unsigned long*)( _idtr+1 );

    gdt = (unsigned long*)host_GDTR_base;
    desc = gdt[ (host_TR_selector >> 3) + 0 ];
    host_TR_base = ((desc >> 16)&0x00FFFFFF)|((desc >> 32)&0xFF000000);
    desc = gdt[ (host_TR_selector >> 3) + 1 ];
    desc <<= 48;    // maneuver to insure 'canonical' address
    host_TR_base |= (desc >> 16)&0xFFFFFFFF00000000;

    asm(    " mov    $0x174, %%ecx            \n"\
        " rdmsr                    \n"\
        " mov    %%eax, host_SYSENTER_CS        \n"\
        " inc    %%ecx                \n"\
        " rdmsr                    \n"\
        " mov    %%eax, host_SYSENTER_ESP+0     \n"\
        " mov    %%edx, host_SYSENTER_ESP+4     \n"\
        " inc    %%ecx                \n"\
        " rdmsr                    \n"\
        " mov    %%eax, host_SYSENTER_EIP+0    \n"\
        " mov    %%edx, host_SYSENTER_EIP+4     \n"\
        ::: "ax", "cx", "dx" );

    asm(    " mov    %0, %%ecx        \n"\
        " rdmsr                \n"\
        " mov    %%eax, host_FS_base+0     \n"\
        " mov    %%edx, host_FS_base+4    \n"\
        :: "i" (0xC0000100) : "ax", "cx", "dx" );

    asm(    " mov    %0, %%ecx        \n"\
        " rdmsr                \n"\
        " mov    %%eax, host_GS_base+0     \n"\
        " mov    %%edx, host_GS_base+4    \n"\
        :: "i" (0xC0000101) : "ax", "cx", "dx" );

    //------------------------------------------------------
    // initialize the global variables for the VMX controls
    //------------------------------------------------------
    control_VMX_pin_based = msr0x480[ 1 ];
    control_VMX_cpu_based = msr0x480[ 2 ];
    control_VM_exit_controls = msr0x480[ 3 ];
    control_VM_entry_controls = msr0x480[ 4 ];
    control_VMX_pin_based |= (1 << 0);    // exit on interrupts    
    control_VMX_pin_based |= (1 << 3);    // NMI-exiting      
    control_VMX_cpu_based |= (1 << 7) | (1 << 29); // Hlt + Monitor exit

    control_pagefault_errorcode_match = 0xFFFFFFFF;
    control_VM_exit_controls |= (1 << 9);    // exit to 64-bit host

    control_CR0_mask = 0x80000021;
    control_CR4_mask = 0x00002000;
    control_CR0_shadow = 0x80000021;
    control_CR4_shadow = 0x00002000;
    control_CR3_target_count = 2;
    control_CR3_target0 = guest_CR3;    // guest's directory
    control_CR3_target1 = host_CR3;        // host's directory

// initialize our counters for NMIs and external interrupts
nmiints = 0;
extints = 0;

    // enable virtual machine extensions (bit 13 in CR4)
    set_CR4_vmxe( NULL );    
    smp_call_function( set_CR4_vmxe, NULL, 1 );

    //---------------------
    // launch the guest VM
    //---------------------
    asm volatile ("    .type  my_vmm, @function    \n"\
        " pushfq                \n"\
        " push    %rax                \n"\
        " push    %rbx                \n"\
        " push    %rcx                \n"\
        " push    %rdx                \n"\
        " push    %rbp                \n"\
        " push    %rsi                \n"\
        " push    %rdi                \n"\
        " push    %r11                \n"\
        "                    \n"\
        " lea    my_vmm, %rax            \n"\
        "                    \n"\
        " mov    %rax, host_RIP            \n"\
        " mov    %rsp, host_RSP            \n"\
        "                    \n"\
        " vmxon    vmxon_region            \n"\
        " jc    fail                \n"\
        " jz    over                \n"\
        "                    \n"\
        " movl    $1, retval            \n"\
        " vmclear guest_region            \n"\
        "                    \n"\
        " movl    $2, retval            \n"\
        " vmptrld guest_region            \n"\
        "                    \n"\
        " movl    $3, retval            \n"\
        "                    \n"\
        " xor    %rdx, %rdx            \n"\
        " mov    elements, %rcx            \n"\
        "nxwr:                    \n"\
        " mov    machine+0(%rdx), %rax        \n"\
        " mov    machine+8(%rdx), %rbx        \n"\
        " vmwrite (%rbx), %rax            \n"\
        " add    $16, %rdx            \n"\
        " loop    nxwr                \n"\
        "                    \n"\
        " movl     $4, retval            \n"\
        " mov    _eax, %eax            \n"\
        " mov    _ebx, %ebx            \n"\
        " mov    _ecx, %ecx            \n"\
        " mov    _edx, %edx            \n"\
        " mov    _ebp, %ebp            \n"\
        " mov    _esi, %esi            \n"\
        " mov    _edi, %edi            \n"\
        "  vmlaunch                \n"\
        " movl     $5, retval            \n"\
        " jmp    read                \n"\
        "my_vmm:                \n"\
        "                    \n"\
        " mov    %eax, _eax            \n"\
        " mov    %ebx, _ebx            \n"\
        " mov    %ecx, _ecx            \n"\
        " mov    %edx, _edx            \n"\
        " mov    %ebp, _ebp            \n"\
        " mov    %esi, _esi            \n"\
        " mov    %edi, _edi            \n"\
        "read:                    \n"\
        " xor    %rdx, %rdx            \n"\
        " mov    rocount, %rcx            \n"\
        "nxrd:                    \n"\
        " mov    results+0(%rdx), %rax        \n"\
        " mov    results+8(%rdx), %rbx        \n"\
        " vmread %rax, (%rbx)            \n"\
        " add    $16, %rdx            \n"\
        " loop    nxrd                \n"\
        "                    \n"\
        " cmpl    $0, info_vmexit_reason        \n"\
        " je    was_nmi                \n"\
        "                    \n"\
        " cmpl    $1, info_vmexit_reason        \n"\
        " je    was_extint            \n"\
        "                    \n"\
        " jmp    over                \n"\
        "                    \n"\
        "was_nmi:                \n"\
        " incl    nmiints                \n"\
/*        " int    $0x02                \n"\
*/        " jmp    resume_guest            \n"\
        "                    \n"\
        "was_extint:                \n"\
        " sti                    \n"\
        " incl    extints                \n"\
        "                    \n"\
        "resume_guest:                \n"\
        " mov    _eax, %eax            \n"\
        " mov    _ebx, %ebx            \n"\
        " mov    _ecx, %ecx            \n"\
        " mov    _edx, %edx            \n"\
        " mov    _ebp, %ebp            \n"\
        " mov    _esi, %esi            \n"\
        " mov    _edi, %edi            \n"\
        "  vmresume                \n"\
        "                    \n"\
        " movl  $-1, retval            \n"\
        "over:                    \n"\
        " vmxoff                \n"\
        "fail:                    \n"\
        " pop    %r11                \n"\
        " pop    %rdi                \n"\
        " pop    %rsi                \n"\
        " pop    %rbp                \n"\
        " pop    %rdx                \n"\
        " pop    %rcx                \n"\
        " pop    %rbx                \n"\
        " pop    %rax                \n"\
        " popfq                    \n"\
        );

    // show why the VMentry failed, or else why the VMexit occurred    
    printk( "\n VM-instruction error: %d  ", info_vminstr_error );
    printk( " Exit Reason: %d \n", info_vmexit_reason );
    printk( " VMexit-interruption-information: %08X \n",
                    info_vmexit_interrupt_information );
    printk( " VMexit-interruption-error-code:  %08X \n",
            info_vmexit_interrupt_error_code  );

    if (retval >= 0) {
        retval = info_vmexit_reason;
    }
    // display the number of external interruption-exits
    printk( "\n" );
    printk( " number of external interrupts = %d \n", extints );
    printk( " number of non-maskable interrupts = %d \n", nmiints );

    // copy the client's virtual-machine register-values
    vm.eflags = (unsigned int)guest_RFLAGS;
    vm.eip = (unsigned int)guest_RIP;    
    vm.esp = (unsigned int)guest_RSP;    
    vm.eax = _eax;
    vm.ebx = _ebx;
    vm.ecx = _ecx;
    vm.edx = _edx;
    vm.ebp = _ebp;
    vm.esi = _esi;
    vm.edi = _edi;
    vm.es  = guest_ES_selector;
    vm.cs  = guest_CS_selector;
    vm.ss  = guest_SS_selector;
    vm.ds  = guest_DS_selector;
    vm.fs  = guest_FS_selector;
    vm.gs  = guest_GS_selector;
    if ( copy_to_user( (void*)buf, &vm, count ) ) return -EFAULT;

    return    retval;
}

// release handler: invoked when the device file is closed.  It logs the
// call and resets the global status code 'retval' to 0.  Always returns 0.
//
// CR4.VMXE is deliberately left untouched here (the author disabled the
// clear_CR4_vmxe() calls that used to sit in this function); the bit is
// cleared in cleanup_module() when the module is unloaded.
int my_release ( struct inode *inode, struct file *file )
{
    retval = 0;
    pr_info("Calling %s\n", __func__);
    return 0;
}

Testing the above code with delay.cpp, I found that the very first VMLAUNCH fails with VM-instruction error 8 (VM entry with invalid host-state fields), while all subsequent VMX operations work fine.

After debugging, I found that this is related to where the VMXE bit (bit 13) of CR4 is set and cleared.

If I set the VMXE bit in init_module, the VM launches successfully every time, with no error 8.

However, if I clear the VMXE bit in my_release, the VMX operation fails every time (so I commented that code out).

I must be missing something important about the sequence for entering VMX root operation.

My test environments are VMware Workstation and a bare-metal Ubuntu Linux host.
I tested on both SMP and non-SMP hosts and got the same result.

linux
x86
virtualization
asked on Stack Overflow Jun 1, 2018 by wangt13 • edited Jun 1, 2018 by jww

0 Answers

Nobody has answered this question yet.


User contributions licensed under CC BY-SA 3.0