/** @file
 * The VirtualBox Support Driver - Linux hosts.
 */

/*
 * Copyright (C) 2006-2007 innotek GmbH
 *
 * This file is part of VirtualBox Open Source Edition (OSE), as
 * available from http://www.virtualbox.org. This file is free software;
 * you can redistribute it and/or modify it under the terms of the GNU
 * General Public License as published by the Free Software Foundation,
 * in version 2 as it comes in the "COPYING" file of the VirtualBox OSE
 * distribution. VirtualBox OSE is distributed in the hope that it will
 * be useful, but WITHOUT ANY WARRANTY of any kind.
 *
 * If you received this file as part of a commercial VirtualBox
 * distribution, then only the terms of your commercial VirtualBox
 * license agreement apply instead of the previous paragraph.
 *
 * Some lines of code to disable the local APIC on x86_64 machines taken
 * from a Mandriva patch by Gwenole Beauchesne.
 */

/*******************************************************************************
*   Header Files                                                               *
*******************************************************************************/
#include "SUPDRV.h"
#include "version-generated.h"

#include <iprt/assert.h>
#include <iprt/spinlock.h>
#include <iprt/semaphore.h>
#include <iprt/initterm.h>
#include <iprt/process.h>
#include <iprt/err.h>
#include <iprt/mem.h>

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/slab.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 0)
# include <linux/jiffies.h>
#endif
#include <asm/mman.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#ifdef CONFIG_DEVFS_FS
# include <linux/devfs_fs_kernel.h>
#endif
#ifdef CONFIG_VBOXDRV_AS_MISC
# include <linux/miscdevice.h>
#endif
#ifdef CONFIG_X86_LOCAL_APIC
# include <asm/apic.h>
# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 0)
#  include <asm/nmi.h>
# endif
#endif

#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 0)
# ifndef page_to_pfn
#  define page_to_pfn(page) ((page) - mem_map)
# endif
# include <asm/pgtable.h>
# define global_flush_tlb __flush_tlb_global
#endif

/* devfs defines */
#if defined(CONFIG_DEVFS_FS) && !defined(CONFIG_VBOXDRV_AS_MISC)
# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 0)

#  define VBOX_REGISTER_DEVFS()                         \
({                                                      \
    void *rc = NULL;                                    \
    if (devfs_mk_cdev(MKDEV(DEVICE_MAJOR, 0),           \
                      S_IFCHR | S_IRUGO | S_IWUGO,      \
                      DEVICE_NAME) == 0)                \
        rc = (void *)' '; /* return not NULL */         \
    rc;                                                 \
 })

#  define VBOX_UNREGISTER_DEVFS(handle)                 \
    devfs_remove(DEVICE_NAME);

# else /* < 2.6.0 */

#  define VBOX_REGISTER_DEVFS()                         \
    devfs_register(NULL, DEVICE_NAME, DEVFS_FL_DEFAULT, \
                   DEVICE_MAJOR, 0,                     \
                   S_IFCHR | S_IRUGO | S_IWUGO,         \
                   &gFileOpsVBoxDrv, NULL)

#  define VBOX_UNREGISTER_DEVFS(handle)                 \
    if (handle != NULL)                                 \
        devfs_unregister(handle)

# endif /* < 2.6.0 */
#endif /* CONFIG_DEVFS_FS && !CONFIG_VBOXDRV_AS_MISC */

#ifndef CONFIG_VBOXDRV_AS_MISC
# if defined(CONFIG_DEVFS_FS) && LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 0)
#  define VBOX_REGISTER_DEVICE(a,b,c)     devfs_register_chrdev(a,b,c)
#  define VBOX_UNREGISTER_DEVICE(a,b)     devfs_unregister_chrdev(a,b)
# else
#  define VBOX_REGISTER_DEVICE(a,b,c)     register_chrdev(a,b,c)
#  define VBOX_UNREGISTER_DEVICE(a,b)     unregister_chrdev(a,b)
# endif
#endif /* !CONFIG_VBOXDRV_AS_MISC */

#ifdef CONFIG_X86_HIGH_ENTRY
# error "CONFIG_X86_HIGH_ENTRY is not supported by VBoxDrv at this time."
#endif

/*
 * This sucks soooo badly on x86! Why don't they export __PAGE_KERNEL_EXEC so PAGE_KERNEL_EXEC would be usable?
 */
#if defined(RT_ARCH_AMD64)
# define MY_PAGE_KERNEL_EXEC    PAGE_KERNEL_EXEC
#elif defined(PAGE_KERNEL_EXEC) && defined(CONFIG_X86_PAE)
# define MY_PAGE_KERNEL_EXEC    __pgprot(cpu_has_pge ? _PAGE_KERNEL_EXEC | _PAGE_GLOBAL : _PAGE_KERNEL_EXEC)
#else
# define MY_PAGE_KERNEL_EXEC    PAGE_KERNEL
#endif
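/*
 * Illustration of how MY_PAGE_KERNEL_EXEC resolves on the three
 * configurations the macro distinguishes:
 *   - amd64:                      PAGE_KERNEL_EXEC as-is.
 *   - x86 PAE w/ PAGE_KERNEL_EXEC: _PAGE_KERNEL_EXEC, plus _PAGE_GLOBAL when
 *                                  the CPU supports global pages (PGE).
 *   - everything else:            plain PAGE_KERNEL -- without PAE there is
 *                                  no NX bit, so executability is implied.
 */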
/*
 * The redhat hack section.
 *  - The current hacks are for 2.4.21-15.EL only.
 */
#ifndef NO_REDHAT_HACKS
/* accounting. */
# if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 0)
#  ifdef VM_ACCOUNT
#   define MY_DO_MUNMAP(a,b,c) do_munmap(a, b, c, 0) /* should it be 1 or 0? */
#  endif
# endif

/* backported remap_page_range. */
# if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 0)
#  include <asm/tlb.h>
#  ifdef tlb_vma /* probably not good enough... */
#   define HAVE_26_STYLE_REMAP_PAGE_RANGE 1
#  endif
# endif

# ifndef RT_ARCH_AMD64
/* In 2.6.9-22.ELsmp we have to call change_page_attr() twice when changing
 * the page attributes from PAGE_KERNEL to something else, because there appears
 * to be a bug in one of the many patches that redhat applied.
 * It should be safe to do this on less buggy linux kernels too. ;-) */
#  define MY_CHANGE_PAGE_ATTR(pPages, cPages, prot) \
    do { \
        if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) \
            change_page_attr(pPages, cPages, prot); \
        change_page_attr(pPages, cPages, prot); \
    } while (0)
# endif
#endif /* !NO_REDHAT_HACKS */

#ifndef MY_DO_MUNMAP
# define MY_DO_MUNMAP(a,b,c) do_munmap(a, b, c)
#endif

#ifndef MY_CHANGE_PAGE_ATTR
# ifdef RT_ARCH_AMD64 /** @todo This is a cheap hack, but it'll get around that 'else BUG();' in __change_page_attr(). */
#  define MY_CHANGE_PAGE_ATTR(pPages, cPages, prot) \
    do { \
        change_page_attr(pPages, cPages, PAGE_KERNEL_NOCACHE); \
        change_page_attr(pPages, cPages, prot); \
    } while (0)
# else
#  define MY_CHANGE_PAGE_ATTR(pPages, cPages, prot) change_page_attr(pPages, cPages, prot)
# endif
#endif

/** @def ONE_MSEC_IN_JIFFIES
 * The number of jiffies that make up 1 millisecond. This is only actually used
 * when HZ is > 1000. */
#if HZ <= 1000
# define ONE_MSEC_IN_JIFFIES    0
#elif !(HZ % 1000)
# define ONE_MSEC_IN_JIFFIES    (HZ / 1000)
#else
# define ONE_MSEC_IN_JIFFIES    ((HZ + 999) / 1000)
# error "HZ is not a multiple of 1000, the GIP stuff won't work right!"
#endif
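/*
 * Worked examples for ONE_MSEC_IN_JIFFIES (illustration):
 *   HZ=100 or HZ=1000: 0 -- the GIP timer simply re-arms for the very next
 *                      jiffy, which is already >= 1ms away.
 *   HZ=2000:           2 jiffies = exactly 1ms.
 *   HZ=2500:           would trip the #error above since 2500 % 1000 != 0.
 */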
#ifdef CONFIG_X86_LOCAL_APIC
/* If an NMI occurs while we are inside the world switcher the machine will
 * crash. The Linux NMI watchdog generates periodic NMIs increasing a counter
 * which is compared with another counter increased in the timer interrupt
 * handler. We disable the NMI watchdog.
 *
 *  - Linux >= 2.6.21: The watchdog is disabled by default on i386 and x86_64.
 *  - Linux <  2.6.21: The watchdog is normally enabled by default on x86_64
 *                     and disabled on i386.
 */
# if defined(RT_ARCH_AMD64)
#  if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 21)
#   define DO_DISABLE_NMI 1
#  endif
# endif

# if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 19)
extern int nmi_active;
#  define nmi_atomic_read(P)    *(P)
#  define nmi_atomic_set(P, V)  *(P) = (V)
#  define nmi_atomic_dec(P)     nmi_atomic_set(P, 0)
# else
#  define nmi_atomic_read(P)    atomic_read(P)
#  define nmi_atomic_set(P, V)  atomic_set(P, V)
#  define nmi_atomic_dec(P)     atomic_dec(P)
# endif

# ifndef X86_FEATURE_ARCH_PERFMON
#  define X86_FEATURE_ARCH_PERFMON (3*32+9) /* Intel Architectural PerfMon */
# endif
# ifndef MSR_ARCH_PERFMON_EVENTSEL0
#  define MSR_ARCH_PERFMON_EVENTSEL0 0x186
# endif
# ifndef ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT
#  define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT (1 << 0)
# endif
#endif /* CONFIG_X86_LOCAL_APIC */


/*******************************************************************************
*   Global Variables                                                           *
*******************************************************************************/
/**
 * Device extension & session data association structure.
 */
static SUPDRVDEVEXT         g_DevExt;

/** Timer structure for the GIP update. */
static struct timer_list    g_GipTimer;
/** Pointer to the page structure for the GIP. */
struct page                *g_pGipPage;

/** Registered devfs device handle. */
#if defined(CONFIG_DEVFS_FS) && !defined(CONFIG_VBOXDRV_AS_MISC)
# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 0)
static void                *g_hDevFsVBoxDrv = NULL;
# else
static devfs_handle_t       g_hDevFsVBoxDrv = NULL;
# endif
#endif

#ifndef CONFIG_VBOXDRV_AS_MISC
/** Module major number */
#define DEVICE_MAJOR        234
/** Saved major device number */
static int                  g_iModuleMajor;
#endif /* !CONFIG_VBOXDRV_AS_MISC */

/** The module name. */
#define DEVICE_NAME         "vboxdrv"

#ifdef RT_ARCH_AMD64
/**
 * Memory for the executable memory heap (in IPRT).
 */
extern uint8_t g_abExecMemory[1572864]; /* 1.5 MB */
__asm__(".section execmemory, \"awx\", @progbits\n\t"
        ".align 32\n\t"
        ".globl g_abExecMemory\n"
        "g_abExecMemory:\n\t"
        ".zero 1572864\n\t"
        ".type g_abExecMemory, @object\n\t"
        ".size g_abExecMemory, 1572864\n\t"
        ".text\n\t");
#endif


/*******************************************************************************
*   Internal Functions                                                         *
*******************************************************************************/
static int      VBoxSupDrvInit(void);
static void     VBoxSupDrvUnload(void);
static int      VBoxSupDrvCreate(struct inode *pInode, struct file *pFilp);
static int      VBoxSupDrvClose(struct inode *pInode, struct file *pFilp);
static int      VBoxSupDrvDeviceControl(struct inode *pInode, struct file *pFilp,
                                        unsigned int IOCmd, unsigned long IOArg);
static RTR3PTR  VBoxSupDrvMapUser(struct page **papPages, unsigned cPages, unsigned fProt, pgprot_t pgFlags);
static int      VBoxSupDrvInitGip(PSUPDRVDEVEXT pDevExt);
static int      VBoxSupDrvTermGip(PSUPDRVDEVEXT pDevExt);
static void     VBoxSupGipTimer(unsigned long ulUser);
#ifdef CONFIG_SMP
static void     VBoxSupGipTimerPerCpu(unsigned long ulUser);
static void     VBoxSupGipResumePerCpu(void *pvUser);
#endif
static int      VBoxSupDrvOrder(unsigned long size);
static int      VBoxSupDrvErr2LinuxErr(int);


/** The file_operations structure. */
static struct file_operations gFileOpsVBoxDrv =
{
    owner:      THIS_MODULE,
    open:       VBoxSupDrvCreate,
    release:    VBoxSupDrvClose,
    ioctl:      VBoxSupDrvDeviceControl,
};

#ifdef CONFIG_VBOXDRV_AS_MISC
/** The miscdevice structure. */
static struct miscdevice gMiscDevice =
{
    minor:      MISC_DYNAMIC_MINOR,
    name:       DEVICE_NAME,
    fops:       &gFileOpsVBoxDrv,
# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 0) && \
     LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 17)
    devfs_name: DEVICE_NAME,
# endif
};
#endif

#ifdef CONFIG_X86_LOCAL_APIC
# ifdef DO_DISABLE_NMI

/** Stop AMD NMI watchdog (x86_64 only). */
static int stop_k7_watchdog(void)
{
    wrmsr(MSR_K7_EVNTSEL0, 0, 0);
    return 1;
}

/** Stop Intel P4 NMI watchdog (x86_64 only). */
static int stop_p4_watchdog(void)
{
    wrmsr(MSR_P4_IQ_CCCR0,  0, 0);
    wrmsr(MSR_P4_IQ_CCCR1,  0, 0);
    wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
    return 1;
}

/** The new method of detecting the event counter. */
static int stop_intel_arch_watchdog(void)
{
    unsigned ebx;

    ebx = cpuid_ebx(10);
    if (!(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
        wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, 0, 0);
    return 1;
}
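/*
 * All three stop_*_watchdog helpers above work the same way: they zero the
 * performance-counter event-select MSR(s) that the NMI watchdog programmed,
 * so the counter stops counting and no further watchdog NMIs are generated.
 * The return value 1 simply means "watchdog stopped".
 */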
/** Stop NMI watchdog. */
static void vbox_stop_apic_nmi_watchdog(void *unused)
{
    int stopped = 0;

    /* only support LOCAL and IO APICs for now */
    if ((nmi_watchdog != NMI_LOCAL_APIC) &&
        (nmi_watchdog != NMI_IO_APIC))
        return;

    if (nmi_watchdog == NMI_LOCAL_APIC)
    {
        switch (boot_cpu_data.x86_vendor)
        {
            case X86_VENDOR_AMD:
                if (strstr(boot_cpu_data.x86_model_id, "Screwdriver"))
                    return;
                stopped = stop_k7_watchdog();
                break;
            case X86_VENDOR_INTEL:
                if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
                {
                    stopped = stop_intel_arch_watchdog();
                    break;
                }
                stopped = stop_p4_watchdog();
                break;
            default:
                return;
        }
    }

    if (stopped)
        nmi_atomic_dec(&nmi_active);
}

/** Disable LAPIC NMI watchdog. */
static void disable_lapic_nmi_watchdog(void)
{
    BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);

    if (nmi_atomic_read(&nmi_active) <= 0)
        return;

    on_each_cpu(vbox_stop_apic_nmi_watchdog, NULL, 1, 1);
    BUG_ON(nmi_atomic_read(&nmi_active) != 0);

    /* tell do_nmi() and others that we're not active any more */
    nmi_watchdog = NMI_NONE;
}

/** Shutdown NMI. */
static void nmi_cpu_shutdown(void *dummy)
{
    unsigned int vERR, vPC;

    vPC = apic_read(APIC_LVTPC);

    if (   (GET_APIC_DELIVERY_MODE(vPC) == APIC_MODE_NMI)
        && !(vPC & APIC_LVT_MASKED))
    {
        vERR = apic_read(APIC_LVTERR);
        apic_write(APIC_LVTERR, vERR | APIC_LVT_MASKED);
        apic_write(APIC_LVTPC,  vPC  | APIC_LVT_MASKED);
        apic_write(APIC_LVTERR, vERR);
    }
}

static void nmi_shutdown(void)
{
    on_each_cpu(nmi_cpu_shutdown, NULL, 0, 1);
}
# endif /* DO_DISABLE_NMI */
#endif /* CONFIG_X86_LOCAL_APIC */
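/*
 * Overview of the module lifecycle implemented below:
 *   VBoxSupDrvInit:   neutralize the NMI watchdog (see above), register the
 *                     character/misc device, RTR0Init, supdrvInitDevExt and
 *                     finally VBoxSupDrvInitGip.
 *   VBoxSupDrvUnload: deregister the device, then VBoxSupDrvTermGip,
 *                     supdrvDeleteDevExt and RTR0Term in reverse order.
 */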
/**
 * Initialize module.
 *
 * @returns appropriate status code.
 */
static int __init VBoxSupDrvInit(void)
{
    int rc;

    dprintf(("VBoxDrv::ModuleInit\n"));

#ifdef CONFIG_X86_LOCAL_APIC
    /*
     * If an NMI occurs while we are inside the world switcher the machine will crash.
     * The Linux NMI watchdog generates periodic NMIs increasing a counter which is
     * compared with another counter increased in the timer interrupt handler. Therefore
     * we don't allow setting up an NMI watchdog.
     */
# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 0)
    /*
     * First test: NMI activated? This only works with Linux 2.6 -- 2.4 does not
     *             export the nmi_watchdog variable.
     */
#  if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 19) || \
      (defined CONFIG_X86_64 && LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 0))
#   ifdef DO_DISABLE_NMI
    if (nmi_atomic_read(&nmi_active) > 0)
    {
        printk(KERN_INFO DEVICE_NAME ": Trying to deactivate the NMI watchdog...\n");

        switch (nmi_watchdog)
        {
            case NMI_LOCAL_APIC:
                disable_lapic_nmi_watchdog();
                break;
            case NMI_NONE:
                nmi_atomic_dec(&nmi_active);
                break;
        }

        if (nmi_atomic_read(&nmi_active) == 0)
        {
            nmi_shutdown();
            printk(KERN_INFO DEVICE_NAME ": Successfully done.\n");
        }
        else
            printk(KERN_INFO DEVICE_NAME ": Failed!\n");
    }
#   endif /* DO_DISABLE_NMI */

    /*
     * Permanent IO_APIC mode active? No way to handle this!
     */
    if (nmi_watchdog == NMI_IO_APIC)
    {
        printk(KERN_ERR DEVICE_NAME ": NMI watchdog in IO_APIC mode active -- refused to load the kernel module!\n"
               DEVICE_NAME ": Please disable the NMI watchdog by specifying 'nmi_watchdog=0' at kernel\n"
               DEVICE_NAME ": command line.\n");
        return -EINVAL;
    }

    /*
     * See arch/i386/kernel/nmi.c on >= 2.6.19: -1 means it can never be enabled again.
     */
    nmi_atomic_set(&nmi_active, -1);
    printk(KERN_INFO DEVICE_NAME ": Trying to deactivate the NMI watchdog permanently...\n");

    /*
     * Now fall through and see if it actually was enabled before. If so, fail
     * as we cannot deactivate it cleanly from here.
     */
#  else /* < 2.6.19 */
    /*
     * Older 2.6 kernels: nmi_watchdog is not initialized by default.
     */
    if (nmi_watchdog != NMI_NONE)
        goto nmi_activated;
#  endif
# endif /* >= 2.6.0 */

    /*
     * Second test: Interrupt generated by performance counter not masked and can
     *              generate an NMI. Works also with Linux 2.4.
     */
    {
        unsigned int v, ver, maxlvt;

        v   = apic_read(APIC_LVR);
        ver = GET_APIC_VERSION(v);
        /* 82489DXs do not report # of LVT entries. */
        maxlvt = APIC_INTEGRATED(ver) ? GET_APIC_MAXLVT(v) : 2;
        if (maxlvt >= 4)
        {
            /* Read status of performance counter IRQ vector */
            v = apic_read(APIC_LVTPC);

            /* performance counter generates NMI and is not masked? */
            if (   (GET_APIC_DELIVERY_MODE(v) == APIC_MODE_NMI)
                && !(v & APIC_LVT_MASKED))
            {
# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 19) || \
     (defined CONFIG_X86_64 && LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 0))
                printk(KERN_ERR DEVICE_NAME
                       ": NMI watchdog either active or at least initialized. Please disable the NMI\n"
                       DEVICE_NAME ": watchdog by specifying 'nmi_watchdog=0' at kernel command line.\n");
                return -EINVAL;
# else /* < 2.6.19 */
#  if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 0)
nmi_activated:
#  endif
                printk(KERN_ERR DEVICE_NAME
                       ": NMI watchdog active -- refused to load the kernel module! Please disable\n"
                       DEVICE_NAME ": the NMI watchdog by specifying 'nmi_watchdog=0' at kernel command line.\n");
                return -EINVAL;
# endif /* >= 2.6.19 */
            }
        }
    }
# if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 19)
    printk(KERN_INFO DEVICE_NAME ": Successfully done.\n");
# endif /* >= 2.6.19 */
#endif /* CONFIG_X86_LOCAL_APIC */

#ifdef CONFIG_VBOXDRV_AS_MISC
    rc = misc_register(&gMiscDevice);
    if (rc)
    {
        printk(KERN_ERR DEVICE_NAME ": Can't register misc device! rc=%d\n", rc);
        return rc;
    }
#else  /* !CONFIG_VBOXDRV_AS_MISC */
    /*
     * Register character device.
     */
    g_iModuleMajor = DEVICE_MAJOR;
    rc = VBOX_REGISTER_DEVICE((dev_t)g_iModuleMajor, DEVICE_NAME, &gFileOpsVBoxDrv);
    if (rc < 0)
    {
        dprintf(("VBOX_REGISTER_DEVICE failed with rc=%#x!\n", rc));
        return rc;
    }

    /*
     * Save the returned module major number.
     */
    if (DEVICE_MAJOR != 0)
        g_iModuleMajor = DEVICE_MAJOR;
    else
        g_iModuleMajor = rc;
    rc = 0;

#ifdef CONFIG_DEVFS_FS
    /*
     * Register a device entry.
     */
    g_hDevFsVBoxDrv = VBOX_REGISTER_DEVFS();
    if (g_hDevFsVBoxDrv == NULL)
    {
        dprintf(("devfs_register failed!\n"));
        rc = -EINVAL;
    }
#endif
#endif /* !CONFIG_VBOXDRV_AS_MISC */
    if (!rc)
    {
        /*
         * Initialize the runtime.
         * On AMD64 we'll have to donate the high rwx memory block to the exec allocator.
         */
        rc = RTR0Init(0);
        if (RT_SUCCESS(rc))
        {
#ifdef RT_ARCH_AMD64
            rc = RTR0MemExecDonate(&g_abExecMemory[0], sizeof(g_abExecMemory));
#endif
            /*
             * Initialize the device extension.
             */
            if (RT_SUCCESS(rc))
                rc = supdrvInitDevExt(&g_DevExt);
            if (!rc)
            {
                /*
                 * Create the GIP page.
                 */
                rc = VBoxSupDrvInitGip(&g_DevExt);
                if (!rc)
                {
                    dprintf(("VBoxDrv::ModuleInit returning %#x\n", rc));
                    return rc;
                }
                supdrvDeleteDevExt(&g_DevExt);
            }
            else
                rc = -EINVAL;
            RTR0Term();
        }
        else
            rc = -EINVAL;

        /*
         * Failed, cleanup and return the error code.
         */
#if defined(CONFIG_DEVFS_FS) && !defined(CONFIG_VBOXDRV_AS_MISC)
        VBOX_UNREGISTER_DEVFS(g_hDevFsVBoxDrv);
#endif
    }
#ifdef CONFIG_VBOXDRV_AS_MISC
    misc_deregister(&gMiscDevice);
    dprintf(("VBoxDrv::ModuleInit returning %#x (minor:%d)\n", rc, gMiscDevice.minor));
#else
    VBOX_UNREGISTER_DEVICE(g_iModuleMajor, DEVICE_NAME);
    dprintf(("VBoxDrv::ModuleInit returning %#x (major:%d)\n", rc, g_iModuleMajor));
#endif
    return rc;
}
/**
 * Unload the module.
 */
static void __exit VBoxSupDrvUnload(void)
{
    int rc;
    dprintf(("VBoxSupDrvUnload\n"));

    /*
     * I don't think it's possible to unload a driver which processes have
     * opened; at least we'll blindly assume that here.
     */
#ifdef CONFIG_VBOXDRV_AS_MISC
    rc = misc_deregister(&gMiscDevice);
    if (rc < 0)
    {
        dprintf(("misc_deregister failed with rc=%#x\n", rc));
    }
#else  /* !CONFIG_VBOXDRV_AS_MISC */
#ifdef CONFIG_DEVFS_FS
    /*
     * Unregister the device entry.
     */
    VBOX_UNREGISTER_DEVFS(g_hDevFsVBoxDrv);
#endif // devfs
    rc = VBOX_UNREGISTER_DEVICE(g_iModuleMajor, DEVICE_NAME);
    if (rc < 0)
    {
        dprintf(("unregister_chrdev failed with rc=%#x (major:%d)\n", rc, g_iModuleMajor));
    }
#endif /* !CONFIG_VBOXDRV_AS_MISC */

    /*
     * Destroy GIP, delete the device extension and terminate IPRT.
     */
    VBoxSupDrvTermGip(&g_DevExt);
    supdrvDeleteDevExt(&g_DevExt);
    RTR0Term();
}

/**
 * Device open. Called on open /dev/vboxdrv
 *
 * @param   pInode      Pointer to inode info structure.
 * @param   pFilp       Associated file pointer.
 */
static int VBoxSupDrvCreate(struct inode *pInode, struct file *pFilp)
{
    int             rc;
    PSUPDRVSESSION  pSession = NULL; /* keep private_data NULL on failure. */
    dprintf(("VBoxSupDrvCreate: pFilp=%p\n", pFilp));

    /*
     * Call common code for the rest.
     */
    rc = supdrvCreateSession(&g_DevExt, (PSUPDRVSESSION *)&pSession);
    if (!rc)
    {
        pSession->Uid       = current->euid;
        pSession->Gid       = current->egid;
        pSession->Process   = RTProcSelf();
        pSession->R0Process = RTR0ProcHandleSelf();
    }

    dprintf(("VBoxSupDrvCreate: g_DevExt=%p pSession=%p rc=%d\n", &g_DevExt, pSession, rc));
    pFilp->private_data = pSession;

    return VBoxSupDrvErr2LinuxErr(rc);
}

/**
 * Close device.
 *
 * @param   pInode      Pointer to inode info structure.
 * @param   pFilp       Associated file pointer.
 */
static int VBoxSupDrvClose(struct inode *pInode, struct file *pFilp)
{
    dprintf(("VBoxSupDrvClose: pFilp=%p private_data=%p\n", pFilp, pFilp->private_data));
    supdrvCloseSession(&g_DevExt, (PSUPDRVSESSION)pFilp->private_data);
    pFilp->private_data = NULL;
    return 0;
}
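#if 0 /* Illustrative ring-3 sketch (not compiled into the driver): this is
       * the calling convention VBoxSupDrvDeviceControl below expects. The
       * field names pvIn/cbIn/pvOut/cbOut come from the handler; the request
       * code uCmd is whatever the SUPDRVIOC headers define, and
       * _IOC_SIZE(uCmd) must equal sizeof(SUPDRVIOCTLDATA) or the handler
       * returns -EINVAL. */
static int supSketchCallDriver(int fd, unsigned uCmd, void *pvReq, size_t cbReq)
{
    SUPDRVIOCTLDATA Args;   /* header describing the in/out payload buffers */
    Args.pvIn  = pvReq;
    Args.cbIn  = cbReq;
    Args.pvOut = pvReq;     /* the driver reuses one kernel buffer for both */
    Args.cbOut = cbReq;
    return ioctl(fd, uCmd, &Args);
}
#endif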
/**
 * Device I/O Control entry point.
 *
 * @param   pInode      Pointer to inode info structure.
 * @param   pFilp       Associated file pointer.
 * @param   IOCmd       The function specified to ioctl().
 * @param   IOArg       The argument specified to ioctl().
 */
static int VBoxSupDrvDeviceControl(struct inode *pInode, struct file *pFilp,
                                   unsigned int IOCmd, unsigned long IOArg)
{
    int                 rc;
    SUPDRVIOCTLDATA     Args;
    void               *pvBuf = NULL;
    int                 cbBuf = 0;
    unsigned            cbOut = 0;

    dprintf2(("VBoxSupDrvDeviceControl: pFilp=%p IOCmd=%x IOArg=%p\n", pFilp, IOCmd, (void *)IOArg));

    /*
     * Copy ioctl data structure from user space.
     */
    if (_IOC_SIZE(IOCmd) != sizeof(SUPDRVIOCTLDATA))
    {
        dprintf(("VBoxSupDrvDeviceControl: incorrect input length! cbArgs=%d\n", _IOC_SIZE(IOCmd)));
        return -EINVAL;
    }
    if (copy_from_user(&Args, (void *)IOArg, _IOC_SIZE(IOCmd)))
    {
        dprintf(("VBoxSupDrvDeviceControl: copy_from_user(&Args) failed.\n"));
        return -EFAULT;
    }

    /*
     * Allocate and copy user space input data buffer to kernel space.
     */
    if (Args.cbIn > 0 || Args.cbOut > 0)
    {
        cbBuf = max(Args.cbIn, Args.cbOut);
        pvBuf = vmalloc(cbBuf);
        if (pvBuf == NULL)
        {
            dprintf(("VBoxSupDrvDeviceControl: failed to allocate buffer of %d bytes.\n", cbBuf));
            return -ENOMEM;
        }

        if (copy_from_user(pvBuf, (void *)Args.pvIn, Args.cbIn))
        {
            dprintf(("VBoxSupDrvDeviceControl: copy_from_user(pvBuf) failed.\n"));
            vfree(pvBuf);
            return -EFAULT;
        }
    }

    /*
     * Process the IOCtl.
     */
    rc = supdrvIOCtl(IOCmd, &g_DevExt, (PSUPDRVSESSION)pFilp->private_data,
                     pvBuf, Args.cbIn, pvBuf, Args.cbOut, &cbOut);

    /*
     * Copy ioctl data and output buffer back to user space.
     */
    if (rc)
    {
        dprintf(("VBoxSupDrvDeviceControl: pFilp=%p IOCmd=%x IOArg=%p failed, rc=%d (linux rc=%d)\n",
                 pFilp, IOCmd, (void *)IOArg, rc, VBoxSupDrvErr2LinuxErr(rc)));
        rc = VBoxSupDrvErr2LinuxErr(rc);
    }
    else if (cbOut > 0)
    {
        if (pvBuf != NULL && cbOut <= cbBuf)
        {
            if (copy_to_user((void *)Args.pvOut, pvBuf, cbOut))
            {
                dprintf(("copy_to_user failed.\n"));
                rc = -EFAULT;
            }
        }
        else
        {
            dprintf(("WHAT!?! supdrvIOCtl messed up! cbOut=%d cbBuf=%d pvBuf=%p\n", cbOut, cbBuf, pvBuf));
            rc = -EPERM;
        }
    }

    if (pvBuf)
        vfree(pvBuf);

    dprintf2(("VBoxSupDrvDeviceControl: returns %d\n", rc));
    return rc;
}

/**
 * Initializes any OS specific object creator fields.
 */
void VBOXCALL supdrvOSObjInitCreator(PSUPDRVOBJ pObj, PSUPDRVSESSION pSession)
{
    NOREF(pObj);
    NOREF(pSession);
}

/**
 * Checks if the session can access the object.
 *
 * @returns true if a decision has been made.
 * @returns false if the default access policy should be applied.
 *
 * @param   pObj        The object in question.
 * @param   pSession    The session wanting to access the object.
 * @param   pszObjName  The object name, can be NULL.
 * @param   prc         Where to store the result when returning true.
 */
bool VBOXCALL supdrvOSObjCanAccess(PSUPDRVOBJ pObj, PSUPDRVSESSION pSession, const char *pszObjName, int *prc)
{
    NOREF(pObj);
    NOREF(pSession);
    NOREF(pszObjName);
    NOREF(prc);
    return false;
}

/**
 * Compute order. Some functions allocate 2^order pages.
 * E.g. cPages=1 gives order 0, cPages=3 or 4 give order 2, cPages=5 gives order 3.
 *
 * @returns order.
 * @param   cPages      Number of pages.
 */
static int VBoxSupDrvOrder(unsigned long cPages)
{
    int             iOrder;
    unsigned long   cTmp;

    for (iOrder = 0, cTmp = cPages; cTmp >>= 1; ++iOrder)
        ;
    if (cPages & ~(1 << iOrder))
        ++iOrder;

    return iOrder;
}

/**
 * OS Specific code for locking down memory.
 *
 * @returns 0 on success.
 * @returns SUPDRV_ERR_* on failure.
 * @param   pMem        Pointer to memory.
 *                      This is not linked in anywhere.
 * @param   paPages     Array which should be filled with the address of the physical pages.
 *
 * @remark  See sgl_map_user_pages() for an example of a similar function.
 */
int  VBOXCALL   supdrvOSLockMemOne(PSUPDRVMEMREF pMem, PSUPPAGE paPages)
{
    int                     rc;
    struct page           **papPages;
    unsigned                iPage;
    unsigned                cPages = pMem->cb >> PAGE_SHIFT;
    unsigned long           pv = (unsigned long)pMem->pvR3;
    struct vm_area_struct **papVMAs;

    /*
     * Allocate page pointer array.
     */
    papPages = vmalloc(cPages * sizeof(*papPages));
    if (!papPages)
        return SUPDRV_ERR_NO_MEMORY;

    /*
     * Allocate the VMA pointer array.
     */
    papVMAs = vmalloc(cPages * sizeof(*papVMAs));
    if (!papVMAs)
    {
        vfree(papPages);
        return SUPDRV_ERR_NO_MEMORY;
    }

    /*
     * Get user pages.
     */
    down_read(&current->mm->mmap_sem);
    rc = get_user_pages(current,                /* Task for fault accounting. */
                        current->mm,            /* Whose pages. */
                        (unsigned long)pv,      /* Where from. */
                        cPages,                 /* How many pages. */
                        1,                      /* Write to memory. */
                        0,                      /* force. */
                        papPages,               /* Page array. */
                        papVMAs);               /* vmas */
    if (rc != cPages)
    {
        /* Drop any references we did get before bailing out. */
        while (rc-- > 0)
            page_cache_release(papPages[rc]);
        up_read(&current->mm->mmap_sem);
        vfree(papVMAs);
        vfree(papPages);
        dprintf(("supdrvOSLockMemOne: get_user_pages failed. rc=%d\n", rc));
        return SUPDRV_ERR_LOCK_FAILED;
    }

    for (iPage = 0; iPage < cPages; iPage++)
        flush_dcache_page(papPages[iPage]);
    up_read(&current->mm->mmap_sem);

    pMem->u.locked.papPages = papPages;
    pMem->u.locked.cPages = cPages;

    /*
     * Get addresses, protect against fork().
     */
    for (iPage = 0; iPage < cPages; iPage++)
    {
        paPages[iPage].Phys = page_to_phys(papPages[iPage]);
        paPages[iPage].uReserved = 0;
        papVMAs[iPage]->vm_flags |= VM_DONTCOPY;
    }

    vfree(papVMAs);

    dprintf2(("supdrvOSLockMemOne: pvR3=%p cb=%d papPages=%p\n",
              pMem->pvR3, pMem->cb, pMem->u.locked.papPages));
    return 0;
}
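/*
 * Pairing note: every page reference taken by get_user_pages() in
 * supdrvOSLockMemOne() above is dropped again in supdrvOSUnlockMemOne()
 * below via page_cache_release(), which on these kernels is how a page
 * reference from get_user_pages() is put.
 */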
/**
 * Unlocks the memory pointed to by pv.
 *
 * @param   pMem    Pointer to memory to unlock.
 *
 * @remark  See sgl_unmap_user_pages() for an example of a similar function.
 */
void VBOXCALL supdrvOSUnlockMemOne(PSUPDRVMEMREF pMem)
{
    unsigned    iPage;
    dprintf2(("supdrvOSUnlockMemOne: pvR3=%p cb=%d papPages=%p\n",
              pMem->pvR3, pMem->cb, pMem->u.locked.papPages));

    /*
     * Loop through the pages and release them.
     */
    for (iPage = 0; iPage < pMem->u.locked.cPages; iPage++)
    {
        if (!PageReserved(pMem->u.locked.papPages[iPage]))
            SetPageDirty(pMem->u.locked.papPages[iPage]);
        page_cache_release(pMem->u.locked.papPages[iPage]);
    }

    /* free the page array */
    vfree(pMem->u.locked.papPages);
    pMem->u.locked.cPages = 0;
}
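/*
 * Note on the GFP_DMA fallback used by supdrvOSContAllocOne() below: on
 * amd64, GFP_DMA allocates from ZONE_DMA, i.e. the first 16MB of RAM, which
 * is far stricter than the "below 4GB" the @todo in the function asks for.
 * GFP_DMA32 (added to the kernel around 2.6.15) would presumably be the
 * better fit where available.
 */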
/**
 * OS Specific code for allocating page aligned memory with contiguous, fixed
 * physical backing.
 *
 * @returns 0 on success.
 * @returns SUPDRV_ERR_* on failure.
 * @param   pMem    Memory reference record of the memory to be allocated.
 *                  (This is not linked in anywhere.)
 * @param   ppvR0   Where to store the virtual address of the ring-0 mapping. (optional)
 * @param   ppvR3   Where to store the virtual address of the ring-3 mapping.
 * @param   pHCPhys Where to store the physical address.
 */
int VBOXCALL supdrvOSContAllocOne(PSUPDRVMEMREF pMem, PRTR0PTR ppvR0, PRTR3PTR ppvR3, PRTHCPHYS pHCPhys)
{
    struct page *paPages;
    unsigned    iPage;
    unsigned    cbAligned = RT_ALIGN(pMem->cb, PAGE_SIZE);
    unsigned    cPages = cbAligned >> PAGE_SHIFT;
    unsigned    cOrder = VBoxSupDrvOrder(cPages);
    unsigned long ulAddr;
    dma_addr_t  HCPhys;
    int         rc = 0;
    pgprot_t    pgFlags;
    pgprot_val(pgFlags) = _PAGE_PRESENT | _PAGE_RW | _PAGE_USER;

    Assert(ppvR3);
    Assert(pHCPhys);

    /*
     * Allocate page pointer array.
     */
#ifdef RT_ARCH_AMD64 /** @todo check out if there is a correct way of getting memory below 4GB (physically). */
    paPages = alloc_pages(GFP_DMA, cOrder);
#else
    paPages = alloc_pages(GFP_USER, cOrder);
#endif
    if (!paPages)
        return SUPDRV_ERR_NO_MEMORY;

    /*
     * Lock the pages.
     */
    for (iPage = 0; iPage < cPages; iPage++)
    {
        SetPageReserved(&paPages[iPage]);
        if (!PageHighMem(&paPages[iPage]) && pgprot_val(MY_PAGE_KERNEL_EXEC) != pgprot_val(PAGE_KERNEL))
            MY_CHANGE_PAGE_ATTR(&paPages[iPage], 1, MY_PAGE_KERNEL_EXEC);
#ifdef DEBUG
        if (iPage + 1 < cPages && (page_to_phys((&paPages[iPage])) + 0x1000) != page_to_phys((&paPages[iPage + 1])))
        {
            dprintf(("supdrvOSContAllocOne: Pages are not contiguous!!!! iPage=%d phys=%llx physnext=%llx\n",
                     iPage, (long long)page_to_phys((&paPages[iPage])), (long long)page_to_phys((&paPages[iPage + 1]))));
            BUG();
        }
#endif
    }
    HCPhys = page_to_phys(paPages);

    /*
     * Allocate user space mapping and put the physical pages into it.
     */
    down_write(&current->mm->mmap_sem);
    ulAddr = do_mmap(NULL, 0, cbAligned, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_ANONYMOUS, 0);
    if (!(ulAddr & ~PAGE_MASK))
    {
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 0) && !defined(HAVE_26_STYLE_REMAP_PAGE_RANGE)
        int rc2 = remap_page_range(ulAddr, HCPhys, cbAligned, pgFlags);
#else
        int rc2 = 0;
        struct vm_area_struct *vma = find_vma(current->mm, ulAddr);
        if (vma)
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 11)
            rc2 = remap_page_range(vma, ulAddr, HCPhys, cbAligned, pgFlags);
#else
            rc2 = remap_pfn_range(vma, ulAddr, HCPhys >> PAGE_SHIFT, cbAligned, pgFlags);
#endif
        else
        {
            rc = SUPDRV_ERR_NO_MEMORY;
            dprintf(("supdrvOSContAllocOne: no vma found for ulAddr=%#lx!\n", ulAddr));
        }
#endif
        if (rc2)
        {
            rc = SUPDRV_ERR_NO_MEMORY;
            dprintf(("supdrvOSContAllocOne: remap_page_range failed rc2=%d\n", rc2));
        }
    }
    else
    {
        dprintf(("supdrvOSContAllocOne: do_mmap failed ulAddr=%#lx\n", ulAddr));
        rc = SUPDRV_ERR_NO_MEMORY;
    }
    up_write(&current->mm->mmap_sem);   /* not quite sure when to give this up. */

    /*
     * Success?
     */
    if (!rc)
    {
        *pHCPhys = HCPhys;
        *ppvR3 = ulAddr;
        if (ppvR0)
            *ppvR0 = (void *)ulAddr;
        pMem->pvR3           = ulAddr;
        pMem->pvR0           = NULL;
        pMem->u.cont.paPages = paPages;
        pMem->u.cont.cPages  = cPages;
        pMem->cb             = cbAligned;

        dprintf2(("supdrvOSContAllocOne: pvR0=%p pvR3=%p cb=%d paPages=%p *pHCPhys=%lx *ppvR0=*ppvR3=%p\n",
                  pMem->pvR0, pMem->pvR3, pMem->cb, paPages, (unsigned long)*pHCPhys, *ppvR3));
        global_flush_tlb();
        return 0;
    }

    /*
     * Failure, cleanup and be gone.
     */
    down_write(&current->mm->mmap_sem);
    if (!(ulAddr & ~PAGE_MASK)) /* only unmap if do_mmap actually succeeded. */
        MY_DO_MUNMAP(current->mm, ulAddr, cbAligned);
    for (iPage = 0; iPage < cPages; iPage++)
    {
        ClearPageReserved(&paPages[iPage]);
        if (!PageHighMem(&paPages[iPage]) && pgprot_val(MY_PAGE_KERNEL_EXEC) != pgprot_val(PAGE_KERNEL))
            MY_CHANGE_PAGE_ATTR(&paPages[iPage], 1, PAGE_KERNEL);
    }
    up_write(&current->mm->mmap_sem);   /* check when we can leave this. */

    __free_pages(paPages, cOrder);
    global_flush_tlb();
    return rc;
}

/**
 * Frees contiguous memory.
 *
 * @param   pMem    Memory reference record of the memory to be freed.
 */
void VBOXCALL supdrvOSContFreeOne(PSUPDRVMEMREF pMem)
{
    unsigned    iPage;

    dprintf2(("supdrvOSContFreeOne: pvR0=%p pvR3=%p cb=%d paPages=%p\n",
              pMem->pvR0, pMem->pvR3, pMem->cb, pMem->u.cont.paPages));

    /*
     * do_exit() destroys the mm before closing files.
     * I really hope it cleans up our stuff properly...
     */
    if (current->mm)
    {
        down_write(&current->mm->mmap_sem);
        MY_DO_MUNMAP(current->mm, (unsigned long)pMem->pvR3, pMem->cb);
        up_write(&current->mm->mmap_sem); /* check when we can leave this. */
    }

    /*
     * Change page attributes and free the pages.
     */
    for (iPage = 0; iPage < pMem->u.cont.cPages; iPage++)
    {
        ClearPageReserved(&pMem->u.cont.paPages[iPage]);
        if (!PageHighMem(&pMem->u.cont.paPages[iPage]) && pgprot_val(MY_PAGE_KERNEL_EXEC) != pgprot_val(PAGE_KERNEL))
            MY_CHANGE_PAGE_ATTR(&pMem->u.cont.paPages[iPage], 1, PAGE_KERNEL);
    }
    __free_pages(pMem->u.cont.paPages, VBoxSupDrvOrder(pMem->u.cont.cPages));

    pMem->u.cont.cPages = 0;
}
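/*
 * The ring-0 side of supdrvOSMemAllocOne() below relies on vmap(). VM_MAP is
 * the vmalloc-area flag meant for mapping caller-supplied pages, so the
 * VM_ALLOC branch is only a fallback for kernels whose vmalloc.h ships
 * vmap() but predates VM_MAP.
 */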
/**
 * Allocates memory which is mapped into both kernel and user space.
 * The returned memory is page aligned and so is the allocation.
 *
 * @returns 0 on success.
 * @returns SUPDRV_ERR_* on failure.
 * @param   pMem    Memory reference record of the memory to be allocated.
 *                  (This is not linked in anywhere.)
 * @param   ppvR0   Where to store the address of the Ring-0 mapping.
 * @param   ppvR3   Where to store the address of the Ring-3 mapping.
 */
int VBOXCALL supdrvOSMemAllocOne(PSUPDRVMEMREF pMem, PRTR0PTR ppvR0, PRTR3PTR ppvR3)
{
    const unsigned  cbAligned = RT_ALIGN(pMem->cb, PAGE_SIZE);
    const unsigned  cPages = cbAligned >> PAGE_SHIFT;
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 4, 22)
    unsigned        cOrder = VBoxSupDrvOrder(cPages);
    struct page    *paPages;
#endif
    struct page   **papPages;
    unsigned        iPage;
    pgprot_t        pgFlags;
    pgprot_val(pgFlags) = _PAGE_PRESENT | _PAGE_RW | _PAGE_USER;

    /*
     * Allocate array with page pointers.
     */
    pMem->u.mem.cPages = 0;
    pMem->u.mem.papPages = papPages = kmalloc(sizeof(papPages[0]) * cPages, GFP_KERNEL);
    if (!papPages)
        return SUPDRV_ERR_NO_MEMORY;

    /*
     * Allocate the pages.
     */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 4, 22)
    for (iPage = 0; iPage < cPages; iPage++)
    {
        papPages[iPage] = alloc_page(GFP_HIGHUSER);
        if (!papPages[iPage])
        {
            pMem->u.mem.cPages = iPage;
            supdrvOSMemFreeOne(pMem);
            return SUPDRV_ERR_NO_MEMORY;
        }
    }

#else /* < 2.4.22 */
    paPages = alloc_pages(GFP_USER, cOrder);
    if (!paPages)
    {
        supdrvOSMemFreeOne(pMem);
        return SUPDRV_ERR_NO_MEMORY;
    }
    for (iPage = 0; iPage < cPages; iPage++)
    {
        papPages[iPage] = &paPages[iPage];
        if (pgprot_val(MY_PAGE_KERNEL_EXEC) != pgprot_val(PAGE_KERNEL))
            MY_CHANGE_PAGE_ATTR(papPages[iPage], 1, MY_PAGE_KERNEL_EXEC);
        if (PageHighMem(papPages[iPage]))
            BUG();
    }
#endif
    pMem->u.mem.cPages = cPages;

    /*
     * Reserve the pages.
     */
    for (iPage = 0; iPage < cPages; iPage++)
        SetPageReserved(papPages[iPage]);

    /*
     * Create the Ring-0 mapping.
     */
    if (ppvR0)
    {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 4, 22)
# ifdef VM_MAP
        *ppvR0 = pMem->pvR0 = vmap(papPages, cPages, VM_MAP, pgFlags);
# else
        *ppvR0 = pMem->pvR0 = vmap(papPages, cPages, VM_ALLOC, pgFlags);
# endif
#else
        *ppvR0 = pMem->pvR0 = phys_to_virt(page_to_phys(papPages[0]));
#endif
    }
    if (pMem->pvR0 || !ppvR0)
    {
        /*
         * Create the ring3 mapping.
         */
        if (ppvR3)
            *ppvR3 = pMem->pvR3 = VBoxSupDrvMapUser(papPages, cPages, PROT_READ | PROT_WRITE | PROT_EXEC, pgFlags);
        if (pMem->pvR3 || !ppvR3)
            return 0;
        dprintf(("supdrvOSMemAllocOne: failed to map into r3! cPages=%u\n", cPages));
    }
    else
        dprintf(("supdrvOSMemAllocOne: failed to map into r0! cPages=%u\n", cPages));

    supdrvOSMemFreeOne(pMem);
    return SUPDRV_ERR_NO_MEMORY;
}

/**
 * Get the physical addresses of the pages in the allocation.
 * This is called while holding the bundle spinlock.
 *
 * @param   pMem        Memory reference record of the memory.
 * @param   paPages     Where to store the page addresses.
 */
void VBOXCALL supdrvOSMemGetPages(PSUPDRVMEMREF pMem, PSUPPAGE paPages)
{
    unsigned    iPage;
    for (iPage = 0; iPage < pMem->u.mem.cPages; iPage++)
    {
        paPages[iPage].Phys = page_to_phys(pMem->u.mem.papPages[iPage]);
        paPages[iPage].uReserved = 0;
    }
}
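/*
 * Allocation/free symmetry: supdrvOSMemAllocOne() gets individual pages with
 * alloc_page() on 2.4.22+ kernels but one physically contiguous 2^order
 * block on older ones, and supdrvOSMemFreeOne() below mirrors exactly that
 * split when releasing the pages again.
 */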
/**
 * Frees memory allocated by supdrvOSMemAllocOne().
 *
 * @param   pMem        Memory reference record of the memory to be freed.
 */
void VBOXCALL supdrvOSMemFreeOne(PSUPDRVMEMREF pMem)
{
    dprintf2(("supdrvOSMemFreeOne: pvR0=%p pvR3=%p cb=%d cPages=%d papPages=%p\n",
              pMem->pvR0, pMem->pvR3, pMem->cb, pMem->u.mem.cPages, pMem->u.mem.papPages));

    /*
     * Unmap the user mapping (if any).
     * do_exit() destroys the mm before closing files.
     */
    if (pMem->pvR3 && current->mm)
    {
        down_write(&current->mm->mmap_sem);
        MY_DO_MUNMAP(current->mm, (unsigned long)pMem->pvR3, RT_ALIGN(pMem->cb, PAGE_SIZE));
        up_write(&current->mm->mmap_sem);    /* check when we can leave this. */
    }
    pMem->pvR3 = NIL_RTR3PTR;

    /*
     * Unmap the kernel mapping (if any).
     */
    if (pMem->pvR0)
    {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 4, 22)
        vunmap(pMem->pvR0);
#endif
        pMem->pvR0 = NULL;
    }

    /*
     * Free the physical pages.
     */
    if (pMem->u.mem.papPages)
    {
        struct page   **papPages = pMem->u.mem.papPages;
        const unsigned  cPages   = pMem->u.mem.cPages;
        unsigned        iPage;

        /* Restore the page flags. */
        for (iPage = 0; iPage < cPages; iPage++)
        {
            ClearPageReserved(papPages[iPage]);
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 4, 22)
            if (pgprot_val(MY_PAGE_KERNEL_EXEC) != pgprot_val(PAGE_KERNEL))
                MY_CHANGE_PAGE_ATTR(papPages[iPage], 1, PAGE_KERNEL);
#endif
        }

        /* Free the pages. */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 4, 22)
        for (iPage = 0; iPage < cPages; iPage++)
            __free_page(papPages[iPage]);
#else
        if (cPages > 0)
            __free_pages(papPages[0], VBoxSupDrvOrder(cPages));
#endif
        /* Free the page pointer array. */
        kfree(papPages);
        pMem->u.mem.papPages = NULL;
    }
    pMem->u.mem.cPages = 0;
}

/**
 * Maps a range of pages into user space.
 *
 * @returns Pointer to the user space mapping on success.
 * @returns NULL on failure.
 * @param   papPages    Array of the pages to map.
 * @param   cPages      Number of pages to map.
 * @param   fProt       The mapping protection.
 * @param   pgFlags     The page level protection.
 */
static RTR3PTR VBoxSupDrvMapUser(struct page **papPages, unsigned cPages, unsigned fProt, pgprot_t pgFlags)
{
    unsigned long ulAddr;

    /*
     * Allocate user space mapping.
     */
    down_write(&current->mm->mmap_sem);
    ulAddr = do_mmap(NULL, 0, cPages * PAGE_SIZE, fProt, MAP_SHARED | MAP_ANONYMOUS, 0);
    if (!(ulAddr & ~PAGE_MASK))
    {
        /*
         * Map page by page into the mmap area.
         * This is generic, paranoid and not very efficient.
         */
        int             rc = 0;
        unsigned long   ulAddrCur = ulAddr;
        unsigned        iPage;
        for (iPage = 0; iPage < cPages; iPage++, ulAddrCur += PAGE_SIZE)
        {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 0) || defined(HAVE_26_STYLE_REMAP_PAGE_RANGE)
            struct vm_area_struct *vma = find_vma(current->mm, ulAddrCur);
            if (!vma)
                break;
#endif

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 11)
            rc = remap_pfn_range(vma, ulAddrCur, page_to_pfn(papPages[iPage]), PAGE_SIZE, pgFlags);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 0) || defined(HAVE_26_STYLE_REMAP_PAGE_RANGE)
            rc = remap_page_range(vma, ulAddrCur, page_to_phys(papPages[iPage]), PAGE_SIZE, pgFlags);
#else /* 2.4 */
            rc = remap_page_range(ulAddrCur, page_to_phys(papPages[iPage]), PAGE_SIZE, pgFlags);
#endif
            if (rc)
                break;
        }

        /*
         * Successful?
         */
        if (iPage >= cPages)
        {
            up_write(&current->mm->mmap_sem);
            return ulAddr;
        }

        /* no, cleanup! */
        if (rc)
            dprintf(("VBoxSupDrvMapUser: remap_[page|pfn]_range failed! rc=%d\n", rc));
        else
            dprintf(("VBoxSupDrvMapUser: find_vma failed!\n"));

        MY_DO_MUNMAP(current->mm, ulAddr, cPages << PAGE_SHIFT);
    }
    else
        dprintf(("VBoxSupDrvMapUser: do_mmap failed ulAddr=%#lx\n", ulAddr));

    up_write(&current->mm->mmap_sem);
    return NIL_RTR3PTR;
}
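/*
 * GIP = Global Information Page: a single page the driver shares read-only
 * with ring-3 and keeps current from timer callbacks. The bookkeeping below
 * advances a monotonic nanosecond clock by TICK_NSEC (where available) per
 * elapsed jiffy; at HZ=1000 that is roughly 1ms per tick.
 */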
/**
 * Initializes the GIP.
 *
 * @returns negative errno.
 * @param   pDevExt     Instance data. GIP stuff may be updated.
 */
static int VBoxSupDrvInitGip(PSUPDRVDEVEXT pDevExt)
{
    struct page *pPage;
    dma_addr_t  HCPhys;
    PSUPGLOBALINFOPAGE pGip;
#ifdef CONFIG_SMP
    unsigned i;
#endif
    dprintf(("VBoxSupDrvInitGip:\n"));

    /*
     * Allocate the page.
     */
    pPage = alloc_pages(GFP_USER, 0);
    if (!pPage)
    {
        dprintf(("VBoxSupDrvInitGip: failed to allocate the GIP page\n"));
        return -ENOMEM;
    }

    /*
     * Lock the page.
     */
    SetPageReserved(pPage);
    g_pGipPage = pPage;

    /*
     * Call common initialization routine.
     */
    HCPhys = page_to_phys(pPage);
    pGip = (PSUPGLOBALINFOPAGE)page_address(pPage);
    pDevExt->ulLastJiffies = jiffies;
#ifdef TICK_NSEC
    pDevExt->u64LastMonotime = (uint64_t)pDevExt->ulLastJiffies * TICK_NSEC;
    dprintf(("VBoxSupDrvInitGIP: TICK_NSEC=%ld HZ=%d jiffies=%ld now=%lld\n",
             TICK_NSEC, HZ, pDevExt->ulLastJiffies, pDevExt->u64LastMonotime));
#else
    pDevExt->u64LastMonotime = (uint64_t)pDevExt->ulLastJiffies * (1000000 / HZ);
    dprintf(("VBoxSupDrvInitGIP: TICK_NSEC=%d HZ=%d jiffies=%ld now=%lld\n",
             (int)(1000000 / HZ), HZ, pDevExt->ulLastJiffies, pDevExt->u64LastMonotime));
#endif
    supdrvGipInit(pDevExt, pGip, HCPhys, pDevExt->u64LastMonotime,
                  HZ <= 1000 ? HZ : 1000);

    /*
     * Initialize the timer.
     */
    init_timer(&g_GipTimer);
    g_GipTimer.data = (unsigned long)pDevExt;
    g_GipTimer.function = VBoxSupGipTimer;
    g_GipTimer.expires = jiffies;
#ifdef CONFIG_SMP
    for (i = 0; i < RT_ELEMENTS(pDevExt->aCPUs); i++)
    {
        pDevExt->aCPUs[i].u64LastMonotime = pDevExt->u64LastMonotime;
        pDevExt->aCPUs[i].ulLastJiffies   = pDevExt->ulLastJiffies;
        pDevExt->aCPUs[i].iSmpProcessorId = -512;
        init_timer(&pDevExt->aCPUs[i].Timer);
        pDevExt->aCPUs[i].Timer.data      = i;
        pDevExt->aCPUs[i].Timer.function  = VBoxSupGipTimerPerCpu;
        pDevExt->aCPUs[i].Timer.expires   = jiffies;
    }
#endif

    return 0;
}

/**
 * Terminates the GIP.
 *
 * @returns negative errno.
 * @param   pDevExt     Instance data. GIP stuff may be updated.
 */
static int VBoxSupDrvTermGip(PSUPDRVDEVEXT pDevExt)
{
    struct page *pPage;
    PSUPGLOBALINFOPAGE pGip;
#ifdef CONFIG_SMP
    unsigned i;
#endif
    dprintf(("VBoxSupDrvTermGip:\n"));

    /*
     * Delete the timer if it's pending.
     */
    if (timer_pending(&g_GipTimer))
        del_timer_sync(&g_GipTimer);
#ifdef CONFIG_SMP
    for (i = 0; i < RT_ELEMENTS(pDevExt->aCPUs); i++)
        if (timer_pending(&pDevExt->aCPUs[i].Timer))
            del_timer_sync(&pDevExt->aCPUs[i].Timer);
#endif

    /*
     * Uninitialize the content.
     */
    pGip = pDevExt->pGip;
    pDevExt->pGip = NULL;
    if (pGip)
        supdrvGipTerm(pGip);

    /*
     * Free the page.
     */
    pPage = g_pGipPage;
    g_pGipPage = NULL;
    if (pPage)
    {
        ClearPageReserved(pPage);
        __free_pages(pPage, 0);
    }

    return 0;
}
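/*
 * Timer topology (summary): in synchronous TSC mode only g_GipTimer runs and
 * updates the whole GIP. In SUPGIPMODE_ASYNC_TSC every CPU gets its own
 * timer (aCPUs[i].Timer, keyed by APIC id) so each TSC is sampled on the CPU
 * that owns it; g_GipTimer then only accounts for the CPU it fires on.
 */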
/**
 * Timer callback function.
 *
 * In ASYNC TSC mode this is called on the primary CPU, and we're
 * assuming that the CPU remains online.
 *
 * @param   ulUser  The device extension pointer.
 */
static void VBoxSupGipTimer(unsigned long ulUser)
{
    PSUPDRVDEVEXT       pDevExt;
    PSUPGLOBALINFOPAGE  pGip;
    unsigned long       ulNow;
    unsigned long       ulDiff;
    uint64_t            u64Monotime;
    unsigned long       SavedFlags;

    local_irq_save(SavedFlags);

    pDevExt = (PSUPDRVDEVEXT)ulUser;
    pGip    = pDevExt->pGip;
    ulNow   = jiffies;

#ifdef CONFIG_SMP
    if (pGip && pGip->u32Mode == SUPGIPMODE_ASYNC_TSC)
    {
        uint8_t iCPU = ASMGetApicId();
        ulDiff = ulNow - pDevExt->aCPUs[iCPU].ulLastJiffies;
        pDevExt->aCPUs[iCPU].ulLastJiffies = ulNow;
#ifdef TICK_NSEC
        u64Monotime = pDevExt->aCPUs[iCPU].u64LastMonotime + ulDiff * TICK_NSEC;
#else
        u64Monotime = pDevExt->aCPUs[iCPU].u64LastMonotime + ulDiff * (1000000 / HZ);
#endif
        pDevExt->aCPUs[iCPU].u64LastMonotime = u64Monotime;
    }
    else
#endif /* CONFIG_SMP */
    {
        ulDiff = ulNow - pDevExt->ulLastJiffies;
        pDevExt->ulLastJiffies = ulNow;
#ifdef TICK_NSEC
        u64Monotime = pDevExt->u64LastMonotime + ulDiff * TICK_NSEC;
#else
        u64Monotime = pDevExt->u64LastMonotime + ulDiff * (1000000 / HZ);
#endif
        pDevExt->u64LastMonotime = u64Monotime;
    }
    if (RT_LIKELY(pGip))
        supdrvGipUpdate(pDevExt->pGip, u64Monotime);
    if (RT_LIKELY(!pDevExt->fGIPSuspended))
        mod_timer(&g_GipTimer, ulNow + (HZ <= 1000 ? 0 : ONE_MSEC_IN_JIFFIES));

    local_irq_restore(SavedFlags);
}

#ifdef CONFIG_SMP
/**
 * Timer callback function for the other CPUs.
 *
 * @param   iTimerCPU   The APIC ID of this timer.
 */
static void VBoxSupGipTimerPerCpu(unsigned long iTimerCPU)
{
    PSUPDRVDEVEXT       pDevExt;
    PSUPGLOBALINFOPAGE  pGip;
    uint8_t             iCPU;
    uint64_t            u64Monotime;
    unsigned long       SavedFlags;

    local_irq_save(SavedFlags);

    pDevExt = &g_DevExt;
    pGip    = pDevExt->pGip;
    iCPU    = ASMGetApicId();

    if (RT_LIKELY(iCPU < RT_ELEMENTS(pGip->aCPUs)))
    {
        if (RT_LIKELY(iTimerCPU == iCPU))
        {
            unsigned long   ulNow  = jiffies;
            unsigned long   ulDiff = ulNow - pDevExt->aCPUs[iCPU].ulLastJiffies;
            pDevExt->aCPUs[iCPU].ulLastJiffies = ulNow;
#ifdef TICK_NSEC
            u64Monotime = pDevExt->aCPUs[iCPU].u64LastMonotime + ulDiff * TICK_NSEC;
#else
            u64Monotime = pDevExt->aCPUs[iCPU].u64LastMonotime + ulDiff * (1000000 / HZ);
#endif
            pDevExt->aCPUs[iCPU].u64LastMonotime = u64Monotime;
            if (RT_LIKELY(pGip))
                supdrvGipUpdatePerCpu(pGip, u64Monotime, iCPU);
            if (RT_LIKELY(!pDevExt->fGIPSuspended))
                mod_timer(&pDevExt->aCPUs[iCPU].Timer, ulNow + (HZ <= 1000 ? 0 : ONE_MSEC_IN_JIFFIES));
        }
        else
            printk("vboxdrv: error: GIP CPU update timer executing on the wrong CPU: apicid=%d != timer-apicid=%ld (cpuid=%d !=? timer-cpuid=%d)\n",
                   iCPU, iTimerCPU, smp_processor_id(), pDevExt->aCPUs[iTimerCPU].iSmpProcessorId);
    }
    else
        printk("vboxdrv: error: APIC ID is bogus (GIP CPU update): apicid=%d max=%lu cpuid=%d\n",
               iCPU, (unsigned long)RT_ELEMENTS(pGip->aCPUs), smp_processor_id());

    local_irq_restore(SavedFlags);
}
#endif /* CONFIG_SMP */

/**
 * Maps the GIP into user space.
 *
 * @returns negative errno.
 * @param   pDevExt     Instance data.
 */
int VBOXCALL supdrvOSGipMap(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE *ppGip)
{
    int             rc = 0;
    unsigned long   ulAddr;
    unsigned long   HCPhys = pDevExt->HCPhysGip;
    pgprot_t        pgFlags;
    pgprot_val(pgFlags) = _PAGE_PRESENT | _PAGE_USER;
    dprintf2(("supdrvOSGipMap: ppGip=%p\n", ppGip));

    /*
     * Allocate user space mapping and put the physical pages into it.
     */
    down_write(&current->mm->mmap_sem);
    ulAddr = do_mmap(NULL, 0, PAGE_SIZE, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, 0);
    if (!(ulAddr & ~PAGE_MASK))
    {
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 0) && !defined(HAVE_26_STYLE_REMAP_PAGE_RANGE)
        int rc2 = remap_page_range(ulAddr, HCPhys, PAGE_SIZE, pgFlags);
#else
        int rc2 = 0;
        struct vm_area_struct *vma = find_vma(current->mm, ulAddr);
        if (vma)
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 11)
            rc2 = remap_page_range(vma, ulAddr, HCPhys, PAGE_SIZE, pgFlags);
#else
            rc2 = remap_pfn_range(vma, ulAddr, HCPhys >> PAGE_SHIFT, PAGE_SIZE, pgFlags);
#endif
        else
        {
            rc = SUPDRV_ERR_NO_MEMORY;
            dprintf(("supdrvOSGipMap: no vma found for ulAddr=%#lx!\n", ulAddr));
        }
#endif
        if (rc2)
        {
            rc = SUPDRV_ERR_NO_MEMORY;
            dprintf(("supdrvOSGipMap: remap_page_range failed rc2=%d\n", rc2));
        }
    }
    else
    {
        dprintf(("supdrvOSGipMap: do_mmap failed ulAddr=%#lx\n", ulAddr));
        rc = SUPDRV_ERR_NO_MEMORY;
    }
    up_write(&current->mm->mmap_sem);  /* not quite sure when to give this up. */

    /*
     * Success?
     */
    if (!rc)
    {
        *ppGip = (PSUPGLOBALINFOPAGE)ulAddr;
        dprintf2(("supdrvOSGipMap: ppGip=%p\n", *ppGip));
        return 0;
    }

    /*
     * Failure, cleanup and be gone.
     */
    if (!(ulAddr & ~PAGE_MASK)) /* only unmap if do_mmap actually succeeded. */
    {
        down_write(&current->mm->mmap_sem);
        MY_DO_MUNMAP(current->mm, ulAddr, PAGE_SIZE);
        up_write(&current->mm->mmap_sem);
    }

    dprintf2(("supdrvOSGipMap: returns %d\n", rc));
    return rc;
}
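/*
 * Note: the user mapping created by supdrvOSGipMap() above is read-only --
 * PROT_READ at mmap time and no _PAGE_RW in pgFlags -- so ring-3 can observe
 * but never modify the GIP.
 */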
/**
 * Unmaps the GIP from user space.
 *
 * @returns negative errno.
 * @param   pDevExt     Instance data.
 */
int VBOXCALL supdrvOSGipUnmap(PSUPDRVDEVEXT pDevExt, PSUPGLOBALINFOPAGE pGip)
{
    dprintf2(("supdrvOSGipUnmap: pGip=%p\n", pGip));
    if (current->mm)
    {
        down_write(&current->mm->mmap_sem);
        MY_DO_MUNMAP(current->mm, (unsigned long)pGip, PAGE_SIZE);
        up_write(&current->mm->mmap_sem);
    }
    dprintf2(("supdrvOSGipUnmap: returns 0\n"));
    return 0;
}

/**
 * Resumes the GIP updating.
 *
 * @param   pDevExt     Instance data.
 */
void VBOXCALL supdrvOSGipResume(PSUPDRVDEVEXT pDevExt)
{
    dprintf2(("supdrvOSGipResume:\n"));
    ASMAtomicXchgU8(&pDevExt->fGIPSuspended, false);
#ifdef CONFIG_SMP
    if (pDevExt->pGip->u32Mode != SUPGIPMODE_ASYNC_TSC)
#endif
        mod_timer(&g_GipTimer, jiffies);
#ifdef CONFIG_SMP
    else
    {
        mod_timer(&g_GipTimer, jiffies);
        smp_call_function(VBoxSupGipResumePerCpu, pDevExt, 0 /* retry */, 1 /* wait */);
    }
#endif
}

#ifdef CONFIG_SMP
/**
 * Callback for resuming GIP updating on the other CPUs.
 *
 * This is only used when the GIP is in async tsc mode.
 *
 * @param   pvUser  Pointer to the device instance.
 */
static void VBoxSupGipResumePerCpu(void *pvUser)
{
    PSUPDRVDEVEXT pDevExt = (PSUPDRVDEVEXT)pvUser;
    uint8_t iCPU = ASMGetApicId();

    if (RT_UNLIKELY(iCPU >= RT_ELEMENTS(pDevExt->pGip->aCPUs)))
    {
        printk("vboxdrv: error: apicid=%d max=%lu cpuid=%d\n",
               iCPU, (unsigned long)RT_ELEMENTS(pDevExt->pGip->aCPUs), smp_processor_id());
        return;
    }

    pDevExt->aCPUs[iCPU].iSmpProcessorId = smp_processor_id();
    mod_timer(&pDevExt->aCPUs[iCPU].Timer, jiffies);
}
#endif /* CONFIG_SMP */

/**
 * Suspends the GIP updating.
 *
 * @param   pDevExt     Instance data.
 */
void VBOXCALL supdrvOSGipSuspend(PSUPDRVDEVEXT pDevExt)
{
#ifdef CONFIG_SMP
    unsigned i;
#endif
    dprintf2(("supdrvOSGipSuspend:\n"));
    ASMAtomicXchgU8(&pDevExt->fGIPSuspended, true);

    if (timer_pending(&g_GipTimer))
        del_timer_sync(&g_GipTimer);
#ifdef CONFIG_SMP
    for (i = 0; i < RT_ELEMENTS(pDevExt->aCPUs); i++)
        if (timer_pending(&pDevExt->aCPUs[i].Timer))
            del_timer_sync(&pDevExt->aCPUs[i].Timer);
#endif
}

/**
 * Get the current CPU count.
 * @returns Number of cpus.
 */
unsigned VBOXCALL supdrvOSGetCPUCount(void)
{
#ifdef CONFIG_SMP
# ifdef num_present_cpus
    return num_present_cpus();
# else
    return smp_num_cpus;
# endif
#else
    return 1;
#endif
}

/**
 * Force async tsc mode.
 * @todo add a module argument for this.
 */
bool VBOXCALL supdrvOSGetForcedAsyncTscMode(void)
{
    return false;
}

/**
 * Converts a supdrv error code to a Linux error code.
 *
 * @returns corresponding Linux error code.
 * @param   rc  supdrv error code (SUPDRV_ERR_* defines).
 */
static int VBoxSupDrvErr2LinuxErr(int rc)
{
    switch (rc)
    {
        case 0:                             return 0;
        case SUPDRV_ERR_GENERAL_FAILURE:    return -EACCES;
        case SUPDRV_ERR_INVALID_PARAM:      return -EINVAL;
        case SUPDRV_ERR_INVALID_MAGIC:      return -EILSEQ;
        case SUPDRV_ERR_INVALID_HANDLE:     return -ENXIO;
        case SUPDRV_ERR_INVALID_POINTER:    return -EFAULT;
        case SUPDRV_ERR_LOCK_FAILED:        return -ENOLCK;
        case SUPDRV_ERR_ALREADY_LOADED:     return -EEXIST;
        case SUPDRV_ERR_PERMISSION_DENIED:  return -EPERM;
        case SUPDRV_ERR_VERSION_MISMATCH:   return -ENOSYS;
        case SUPDRV_ERR_IDT_FAILED:         return -1000;
    }

    return -EPERM;
}

RTDECL(int) SUPR0Printf(const char *pszFormat, ...)
{
#if 1
    va_list args;
    char    szMsg[512];

    va_start(args, pszFormat);
    vsnprintf(szMsg, sizeof(szMsg) - 1, pszFormat, args);
    szMsg[sizeof(szMsg) - 1] = '\0';
    printk("%s", szMsg);
    va_end(args);
#else
    /* forward to printf - needs some more GCC hacking to fix ebp... */
    __asm__ __volatile__ ("mov %0, %esp\n\t"
                          "jmp %1\n\t",
                          :: "r" ((uintptr_t)&pszFormat - 4),
                             "m" (printk));
#endif
    return 0;
}
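/*
 * Note: both SUPR0Printf above and AssertMsg2 below format into fixed stack
 * buffers (512 and 256 bytes respectively), so longer messages are silently
 * truncated before they reach printk.
 */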
/** Runtime assert implementation for Linux Ring-0. */
RTDECL(void) AssertMsg1(const char *pszExpr, unsigned uLine, const char *pszFile, const char *pszFunction)
{
    printk("!!Assertion Failed!!\n"
           "Expression: %s\n"
           "Location  : %s(%d) %s\n",
           pszExpr, pszFile, uLine, pszFunction);
}

/** Runtime assert implementation for Linux Ring-0. */
RTDECL(void) AssertMsg2(const char *pszFormat, ...)
{   /* forwarder. */
    va_list ap;
    char    msg[256];

    va_start(ap, pszFormat);
    vsnprintf(msg, sizeof(msg) - 1, pszFormat, ap);
    msg[sizeof(msg) - 1] = '\0';
    printk("%s", msg);
    va_end(ap);
}

/* GCC C++ hack. */
unsigned __gxx_personality_v0 = 0xcccccccc;

module_init(VBoxSupDrvInit);
module_exit(VBoxSupDrvUnload);

MODULE_AUTHOR("innotek GmbH");
MODULE_DESCRIPTION("VirtualBox Support Driver");
MODULE_LICENSE("GPL");
#ifdef MODULE_VERSION
# define xstr(s) str(s)
# define str(s) #s
MODULE_VERSION(VBOX_VERSION_STRING " (" xstr(SUPDRVIOC_VERSION) ")");
#endif