Search code examples
assembly 64-bit intel freebsd kernel-module

FreeBSD module for virtual-to-physical address translation


I'm learning LKM programming on FreeBSD, and as a first project I'm trying to write a system call that takes a virtual memory address of a process address space as an argument and returns the corresponding physical address in RAM (if the virtual address in question is mapped into memory – if it's not then the system call returns an error). I'm running on an Intel x64 chip, so I read through Chapter 4 of Volume 3A of the Intel developers' manual, which details how the system handles virtual-to-physical address translation. I believe I've implemented this procedure correctly in my kernel module code, given below:

#include <sys/types.h>
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/module.h>
#include <sys/sysent.h>
#include <sys/kernel.h>
#include <sys/sysproto.h>
#include <sys/systm.h>

//argument block of the vtp system call; vtp() reads it through its args pointer
struct vtp_args {
    unsigned long  vaddr;       //virtual address to translate
    unsigned long *to_fill;     //destination the physical address is copied out to
};

//---- virtual address fields --------------------------------------
//each macro keeps one 9-bit table index of x in place (nothing is
//shifted here); the masks are written as <9 one-bits> << <lowest bit>
#define PML5_MASK(x)    ((x) & (0x1ffL << 48))          //bits 56 to 48
#define PML4_MASK(x)    ((x) & (0x1ffL << 39))          //bits 47 to 39
#define PDPT_MASK(x)    ((x) & (0x1ffL << 30))          //bits 38 to 30
#define PD_MASK(x)      ((x) & (0x1ffL << 21))          //bits 29 to 21
#define PT_MASK(x)      ((x) & (0x1ffL << 12))          //bits 20 to 12

//---- paging-structure entry fields -------------------------------
#define PE_ADDR_MASK(x) ((x) & (0xffffffffffL << 12))   //bits 51 to 12
#define PE_PS_FLAG(x)   ((x) & 0x80L)                   //page size flag (bit 7)
#define PE_P_FLAG(x)    ((x) & 0x1)                     //present flag (bit 0)

//---- direct map --------------------------------------------------
//bits 63 to 43 set; a physical address is OR-ed into the low bits
#define DMAP_MIN_ADDRESS    (0x1fffffUL << 43)
#define PHYS_TO_VIRT(x)     (DMAP_MIN_ADDRESS | (x))

//vtp: system call handler. Reads CR3, walks the paging structures through
//the direct map (PHYS_TO_VIRT) to translate uap->vaddr, and copies the
//resulting physical address out to uap->to_fill.
//
//args is interpreted as a struct vtp_args; td is not used.
//
//returns EOPNOTSUPP when the asm block reports failure (cr3 left at 0),
//EFAULT where a present-flag check fails, otherwise whatever copyout() returns.
//
//NOTE(review): the table indices below are shifted right by 51/42/33/24/15.
//Each entry is read as an 8-byte unsigned long, so an entry at index i would
//sit at byte offset i*8 (index in bits 11..3 of the address); that corresponds
//to shifts of 45/36/27/18/9. With the shifts used here the low index bits are
//dropped and the offset is not a multiple of 8 -- likely the cause of the
//zero "pml4e" this listing is reported to print.
static int
vtp(struct thread *td, void *args) {
    struct vtp_args *uap=args;
    unsigned long vaddr=uap->vaddr;
    unsigned long *to_fill=uap->to_fill;

    //asm block checks to see if 4 or 5-level paging is enabled
    //if so, moves the cr3 register into the cr3 variable
    //and sets la57_flag to assert whether 4-level or 5-level
    //NOTE(review): rdmsr also writes edx, which is not in the clobber list,
    //and the labels are plain (non-local) assembler symbols -- confirm both
    //are acceptable for this build
    int la57_flag=0;
    unsigned long cr3=0;
    __asm__ __volatile__ (
        "mov %%cr0, %%rax;"         //check bit 31 of cr0 (PG flag)
        "test $0x80000000, %%eax;"  //deny request if 0
        "jz fail;"                  //(ie if paging is not enabled)

        "mov $0xc0000080, %%ecx;"   //check bit 8 of ia32_efer (LME flag)
        "rdmsr;"                    //deny request if 0
        "test $0x100, %%eax;"       //(module currently can't handle pae paging)
        "jz fail;"
        
    "success:\n"
        "mov %%cr3, %0;"
        "mov %%cr4, %%rax;"
        "shr $20, %%rax;"           //extracts bit 20 of cr4
                                    //NOTE(review): the SDM lists LA57 as CR4 bit 12 -- confirm
        "and $1, %%rax;"
        "mov %%eax, %1;"
        "jmp break;"
    "fail:\n"
        "mov $0, %0;"               //cr3==0 is the failure marker tested below
    "break:\n"
    
        : "=r"(cr3), "=r"(la57_flag)
        ::"rax", "ecx", "memory");
    if(!cr3) {
        return EOPNOTSUPP; }
    /////////////////////////////////////////////////////
    //psentry holds the entry read at the current level (or cr3 itself as the
    //starting point of a 4-level walk); paddr is the value copied out
    unsigned long psentry=0, paddr=0;
    
    //get pml5e (if applicable)
    if(la57_flag) {         //5-level paging
        psentry=*(unsigned long *)\
            PHYS_TO_VIRT( PE_ADDR_MASK(cr3)|(PML5_MASK(vaddr)>>51) );
            if(!PE_P_FLAG(psentry)) {
                return EFAULT; }}
    else {
        psentry=cr3; }
   
    //get pml4e
    //(the first debug line is labelled "cr3" but prints psentry, which is the
    //pml5e when la57_flag is set)
    uprintf("[debug]: cr3:    0x%lx\n", psentry);
    uprintf("[debug]: &pml4e: 0x%lx\n", PHYS_TO_VIRT( PE_ADDR_MASK(psentry)|(PML4_MASK(vaddr)>>42) ));
    psentry=*(unsigned long *)\
        PHYS_TO_VIRT( PE_ADDR_MASK(psentry)|(PML4_MASK(vaddr)>>42) );
    uprintf("[debug]: pml4e:  0x%lx\n", psentry);
    if(!PE_P_FLAG(psentry)) {
        return EFAULT; }
   
    //get pdpte
    //(the page size flag is tested before the present flag at this level)
    psentry=*(unsigned long *)\
        PHYS_TO_VIRT( PE_ADDR_MASK(psentry)|(PDPT_MASK(vaddr)>>33) );
    uprintf("[debug]: pdpte:  0x%lx\n", psentry);
    if(PE_PS_FLAG(psentry)) {   //1GB page
        //bits (51 to 30) | bits (29 to 0)
        //NOTE(review): the entry mask literal below covers bits 51 to 34 -- confirm
        paddr=(psentry&0x0ffffc00000000)|(vaddr&0x3fffffff);
        return copyout(&paddr, to_fill, sizeof(unsigned long)); }
    if(!PE_P_FLAG(psentry)) {
        return EFAULT; }
   
    //get pde
    //(the page size flag is tested before the present flag at this level)
    psentry=*(unsigned long *)\
        PHYS_TO_VIRT( PE_ADDR_MASK(psentry)|(PD_MASK(vaddr)>>24) );
    uprintf("[debug]: pde:    0x%lx\n", psentry);
    if(PE_PS_FLAG(psentry)) {   //2MB page
        //bits (51 to 21) | bits (20 to 0)
        //NOTE(review): the literals below cover bits 51 to 17 and 16 to 0 -- confirm
        paddr=(psentry&0x0ffffffffe0000)|(vaddr&0x1ffff);
        return copyout(&paddr, to_fill, sizeof(unsigned long)); }
    if(!PE_P_FLAG(psentry)) {
        return EFAULT; }
     
    //get pte
    //(no present-flag test is made on the pte before it is used)
    psentry=*(unsigned long *)\
        PHYS_TO_VIRT( PE_ADDR_MASK(psentry)|(PT_MASK(vaddr)>>15) );
    uprintf("[debug]: pte:    0x%lx\n", psentry);
    //4KB page: frame bits (51 to 12) of the entry | offset bits (11 to 0) of vaddr
    paddr=(psentry&0x0ffffffffff000)|(vaddr&0xfff);
    return copyout(&paddr, to_fill, sizeof(unsigned long)); }
     

static
struct sysent vtp_sysent = {
    2,
    vtp };

static int offset=NO_SYSCALL;

//module event handler registered through SYSCALL_MODULE below
//reports load/unload on the caller's terminal together with the current
//value of offset; every other event is rejected with EOPNOTSUPP
static int
load(struct module *module, int cmd, void *arg) {
    if(cmd==MOD_LOAD) {
        uprintf("loading syscall at offset %d\n", offset);
        return 0; }

    if(cmd==MOD_UNLOAD) {
        uprintf("unloading syscall from offset %d\n", offset);
        return 0; }

    return EOPNOTSUPP; }

SYSCALL_MODULE(vtp, &offset, &vtp_sysent, load, NULL);

I found the DMAP_MIN_ADDRESS constant just by grepping through the /sys/amd64 directory, and I am fairly confident I have the correct one as the code doesn't cause any kernel panics when called. Once I load the module, I test the syscall with the following code:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/module.h>

int main() {
    int x=0;
    unsigned long vaddr=(unsigned long)&x;
    unsigned long paddr=0;

    int syscall_num;
    int modid;
    struct module_stat stat;
    stat.version=sizeof(stat);
    if((modid=modfind("sys/vtp"))==-1) {
        perror("fatal in modfind");
        exit(-1); }
    if(modstat(modid, &stat)==-1) {
        perror("fatal in modstat");
        exit(-1); }
    syscall_num=stat.data.intval;
    
    if(syscall(syscall_num, vaddr, &paddr)) {
        perror("fatal in syscall");
        exit(-1); }
    printf("virtual address:    %p\n"
           "physical address:   %p\n",
           (void *)vaddr, (void *)paddr);
    return 0; }

Unfortunately, I get the following strange output:

$ ./vtp_test
[debug]: cr3:    0x2d48663c
[debug]: &pml4e: 0xfffff8002d48601f
[debug]: pml4e:  0x0
fatal in syscall: Bad address

So, for some reason the "PML4 entry" comes back as 0, which is obviously incorrect. I suspect the issue must be in my implementation of the address resolution algorithm given in the Intel developers' guide, but I can't see where the error is. Can anyone give some insight?

P.S. I am of course running this on a virtual machine, so is it possible this is causing some problem?


Solution

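  • Ah, it was a stupid mistake: I used incorrect bit-shift values. Each paging-structure entry is 8 bytes wide, so the 9-bit index taken from the virtual address has to end up in bits 11:3 of the entry's address; the right shift is therefore the index's lowest bit position minus 3 (45, 36, 27, 18 and 9 for the PML5, PML4, PDPT, PD and PT levels). The following code is corrected (with some additional debug statements added):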

    #include <sys/types.h>
    #include <sys/param.h>
    #include <sys/proc.h>
    #include <sys/module.h>
    #include <sys/sysent.h>
    #include <sys/kernel.h>
    #include <sys/sysproto.h>
    #include <sys/systm.h>
    
    //argument block of the vtp system call; vtp() reads it through its args pointer
    struct vtp_args {
        unsigned long  vaddr;       //virtual address to translate
        unsigned long *to_fill;     //destination the physical address is copied out to
    };
    
    //---- virtual address fields --------------------------------------
    //each macro keeps one 9-bit table index of x in place (nothing is
    //shifted here); the masks are written as <9 one-bits> << <lowest bit>
    #define PML5_MASK(x)    ((x) & (0x1ffL << 48))          //bits 56 to 48
    #define PML4_MASK(x)    ((x) & (0x1ffL << 39))          //bits 47 to 39
    #define PDPT_MASK(x)    ((x) & (0x1ffL << 30))          //bits 38 to 30
    #define PD_MASK(x)      ((x) & (0x1ffL << 21))          //bits 29 to 21
    #define PT_MASK(x)      ((x) & (0x1ffL << 12))          //bits 20 to 12
    
    //---- paging-structure entry fields -------------------------------
    #define PE_ADDR_MASK(x) ((x) & (0xffffffffffL << 12))   //bits 51 to 12
    #define PE_PS_FLAG(x)   ((x) & 0x80L)                   //page size flag (bit 7)
    #define PE_P_FLAG(x)    ((x) & 0x1)                     //present flag (bit 0)
    
    //---- direct map --------------------------------------------------
    //bits 63 to 43 set; a physical address is OR-ed into the low bits
    #define DMAP_MIN_ADDRESS    (0x1fffffUL << 43)
    #define PHYS_TO_VIRT(x)     (DMAP_MIN_ADDRESS | (x))
    
    //vtp: system call handler that translates a virtual address into a
    //physical address by walking the paging structures rooted at CR3,
    //reading each entry through the direct map (PHYS_TO_VIRT).
    //
    //args points to a struct vtp_args: vaddr is the address to translate,
    //to_fill receives the physical address via copyout(). td is not used.
    //
    //returns EOPNOTSUPP if paging or long mode is not enabled, EFAULT if an
    //entry on the walk is not present, otherwise the result of copyout().
    //
    //NOTE(review): nothing here locks the page tables, so they may change
    //while being walked; treat the result as a best-effort snapshot.
    static int
    vtp(struct thread *td, void *args) {
        struct vtp_args *uap=args;
        unsigned long vaddr=uap->vaddr;
        unsigned long *to_fill=uap->to_fill;
    
        //every register is read by its own asm statement with explicit
        //operand constraints: the compiler then knows exactly which registers
        //are written (rdmsr writes edx as well as eax) and no assembler
        //labels are needed, so the code stays valid if it is ever duplicated
        unsigned long cr0=0, cr3=0, cr4=0;
        unsigned int efer_lo=0, efer_hi=0;
    
        __asm__ __volatile__ ("mov %%cr0, %0" : "=r"(cr0));
        if(!(cr0&0x80000000UL)) {       //bit 31 of cr0 (PG flag): paging is not enabled
            return EOPNOTSUPP; }
    
        __asm__ __volatile__ ("rdmsr"
            : "=a"(efer_lo), "=d"(efer_hi)
            : "c"(0xc0000080U));        //ia32_efer
        (void)efer_hi;                  //only the low half carries the flag tested here
        if(!(efer_lo&0x100U)) {         //bit 8 of ia32_efer (LME flag): module can't handle pae paging
            return EOPNOTSUPP; }
    
        __asm__ __volatile__ ("mov %%cr3, %0" : "=r"(cr3));
        __asm__ __volatile__ ("mov %%cr4, %0" : "=r"(cr4));
        //5-level paging is CR4.LA57, which is bit 12 (bit 20 is SMEP)
        int la57_flag=(int)((cr4>>12)&1);
        ////////////////////////////////////////////////////////////////////
        //psentry: entry read at the current level; entry_va: direct-map
        //address of the entry about to be read. An entry's byte offset inside
        //its table is index*8, hence each shift is (lowest index bit - 3).
        unsigned long psentry=0, paddr=0, entry_va=0;
    
        //pml5e (if applicable)
        if(la57_flag) {         //5-level paging
            entry_va=PHYS_TO_VIRT( PE_ADDR_MASK(cr3)|(PML5_MASK(vaddr)>>45) );
            uprintf("[debug]: &pml5e:\t0x%lx\n", entry_va);
            psentry=*(unsigned long *)entry_va;
            uprintf("[debug]: pml5e:\t\t0x%lx\n", psentry);
            if(!PE_P_FLAG(psentry)) {
                return EFAULT; }}
        else {
            psentry=cr3; }
    
        //pml4e
        entry_va=PHYS_TO_VIRT( PE_ADDR_MASK(psentry)|(PML4_MASK(vaddr)>>36) );
        uprintf("[debug]: &pml4e:\t0x%lx\n", entry_va);
        psentry=*(unsigned long *)entry_va;
        uprintf("[debug]: pml4e:\t\t0x%lx\n", psentry);
        if(!PE_P_FLAG(psentry)) {
            return EFAULT; }
    
        //pdpte
        entry_va=PHYS_TO_VIRT( PE_ADDR_MASK(psentry)|(PDPT_MASK(vaddr)>>27) );
        uprintf("[debug]: &pdpte:\t0x%lx\n", entry_va);
        psentry=*(unsigned long *)entry_va;
        uprintf("[debug]: pdpte:\t\t0x%lx\n", psentry);
        //the present flag is tested first: the other bits of a non-present
        //entry (including PS) carry no meaning
        if(!PE_P_FLAG(psentry)) {
            return EFAULT; }
        if(PE_PS_FLAG(psentry)) {   //1GB page
            //bits (51 to 30) of the entry | bits (29 to 0) of vaddr
            paddr=(psentry&0x000fffffc0000000)|(vaddr&0x3fffffff);
            return copyout(&paddr, to_fill, sizeof(unsigned long)); }
    
        //pde
        entry_va=PHYS_TO_VIRT( PE_ADDR_MASK(psentry)|(PD_MASK(vaddr)>>18) );
        uprintf("[debug]: &pde:\t\t0x%lx\n", entry_va);
        psentry=*(unsigned long *)entry_va;
        uprintf("[debug]: pde:\t\t0x%lx\n", psentry);
        if(!PE_P_FLAG(psentry)) {
            return EFAULT; }
        if(PE_PS_FLAG(psentry)) {   //2MB page
            //bits (51 to 21) of the entry | bits (20 to 0) of vaddr
            paddr=(psentry&0x000fffffffe00000)|(vaddr&0x1fffff);
            return copyout(&paddr, to_fill, sizeof(unsigned long)); }
    
        //pte
        entry_va=PHYS_TO_VIRT( PE_ADDR_MASK(psentry)|(PT_MASK(vaddr)>>9) );
        uprintf("[debug]: &pte:\t\t0x%lx\n", entry_va);
        psentry=*(unsigned long *)entry_va;
        uprintf("[debug]: pte:\t\t0x%lx\n", psentry);
        //a non-present pte means the page is not mapped; without this test a
        //meaningless frame number would be reported as success
        if(!PE_P_FLAG(psentry)) {
            return EFAULT; }
        //4KB page: bits (51 to 12) of the entry | bits (11 to 0) of vaddr
        paddr=PE_ADDR_MASK(psentry)|(vaddr&0xfff);
        return copyout(&paddr, to_fill, sizeof(unsigned long)); }
    
    static
    struct sysent vtp_sysent = {
        2,
        vtp };
    
    static int offset=NO_SYSCALL;
    
    //module event handler registered through SYSCALL_MODULE below
    //reports load/unload on the caller's terminal together with the current
    //value of offset; every other event is rejected with EOPNOTSUPP
    static int
    load(struct module *module, int cmd, void *arg) {
        if(cmd==MOD_LOAD) {
            uprintf("loading syscall at offset %d\n", offset);
            return 0; }
    
        if(cmd==MOD_UNLOAD) {
            uprintf("unloading syscall from offset %d\n", offset);
            return 0; }
    
        return EOPNOTSUPP; }
    
    SYSCALL_MODULE(vtp, &offset, &vtp_sysent, load, NULL);
    

    This yields much better-looking output:

    $ ./vtp_test
    [debug]: &pml4e:        0xfffff800341a27f8
    [debug]: pml4e:         0x80000000341fc067
    [debug]: &pdpte:        0xfffff800341fcff8
    [debug]: pdpte:         0x341b7067
    [debug]: &pde:          0xfffff800341b7ff8
    [debug]: pde:           0x34174067
    [debug]: &pte:          0xfffff80034174ff0
    [debug]: pte:           0x8000000030de9467
    virtual address:        0x7fffffffea9c
    physical address:       0x30de9a9c