I'm learning LKM programming on FreeBSD, and as a first project I'm trying to write a system call that takes a virtual memory address of a process address space as an argument and returns the corresponding physical address in RAM (if the virtual address in question is mapped into memory – if it's not then the system call returns an error). I'm running on an Intel x64 chip, so I read through Chapter 4 of Volume 3A of the Intel developers' manual, which details how the system handles virtual-to-physical address translation. I believe I've implemented this procedure correctly in my kernel module code, given below:
#include <sys/types.h>
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/module.h>
#include <sys/sysent.h>
#include <sys/kernel.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
/*
 * Argument block handed to the vtp system call.
 * The handler reads both members from the pointer it receives in `args`.
 */
struct vtp_args {
	/* virtual address whose translation is requested */
	unsigned long	vaddr;
	/* destination the handler copyout()s the physical address to */
	unsigned long	*to_fill;
};
/*
 * Virtual-address index fields.
 * Each macro isolates one 9-bit table index and leaves it in place
 * (unshifted); all other bits are cleared.
 */
#define PML5_MASK(x)	((x) & (0x1ffL << 48))	/* bits 56..48 */
#define PML4_MASK(x)	((x) & (0x1ffL << 39))	/* bits 47..39 */
#define PDPT_MASK(x)	((x) & (0x1ffL << 30))	/* bits 38..30 */
#define PD_MASK(x)	((x) & (0x1ffL << 21))	/* bits 29..21 */
#define PT_MASK(x)	((x) & (0x1ffL << 12))	/* bits 20..12 */

/*
 * Paging-structure entry fields.
 */
#define PE_ADDR_MASK(x)	((x) & (0xffffffffffL << 12))	/* bits 51..12 */
#define PE_PS_FLAG(x)	((x) & (1L << 7))		/* page size flag */
#define PE_P_FLAG(x)	((x) & 0x1)			/* present flag */

/*
 * Physical-to-virtual conversion: OR the physical address into the
 * DMAP_MIN_ADDRESS base.
 */
#define DMAP_MIN_ADDRESS	(0xfffff80000000000UL)
#define PHYS_TO_VIRT(x)		((x) | DMAP_MIN_ADDRESS)
/*
 * vtp: system call handler.  Walks the paging structures for uap->vaddr
 * and copies the resulting physical address out to uap->to_fill.
 *
 * Returns EOPNOTSUPP when the asm probe below leaves cr3 at 0, EFAULT when
 * an entry with a clear present flag is met, otherwise whatever copyout()
 * returns.
 *
 * NOTE: this is the listing as originally posted.  The right-shift counts
 * applied to the *_MASK(vaddr) fields (51/42/33/24/15) are too large by 6:
 * the 9-bit index must end up in bits 11:3 of the entry address (index * 8),
 * which needs 45/36/27/18/9.  The corrected listing further down uses those
 * values; the debug output following this listing shows the symptom.
 */
static int
vtp(struct thread *td, void *args) {
struct vtp_args *uap=args;
unsigned long vaddr=uap->vaddr;
unsigned long *to_fill=uap->to_fill;
//asm block checks to see if 4 or 5-level paging is enabled
//if so, moves the cr3 register into the cr3 variable
//and sets la57_flag to assert whether 4-level or 5-level
int la57_flag=0;
unsigned long cr3=0;
// NOTE(review): "fail", "success" and "break" are ordinary (non-local)
// assembler labels; they would clash if this asm were ever emitted twice.
__asm__ __volatile__ (
"mov %%cr0, %%rax;" //check bit 31 of cr0 (PG flag)
"test $0x80000000, %%eax;" //deny request if 0
"jz fail;" //(ie if paging is not enabled)
"mov $0xc0000080, %%ecx;" //check bit 8 of ia32_efer (LME flag)
// NOTE(review): rdmsr returns its result in EDX:EAX, but rdx is not in the
// clobber list below -- confirm against the instruction reference.
"rdmsr;" //deny request if 0
"test $0x100, %%eax;" //(module currently can't handle pae paging)
"jz fail;"
"success:\n"
"mov %%cr3, %0;"
"mov %%cr4, %%rax;"
// NOTE(review): the Intel SDM lists CR4.LA57 as bit 12 (bit 20 is SMEP);
// the shift count of 20 looks wrong -- confirm.
"shr $20, %%rax;"
"and $1, %%rax;"
"mov %%eax, %1;"
"jmp break;"
"fail:\n"
"mov $0, %0;"
"break:\n"
: "=r"(cr3), "=r"(la57_flag)
::"rax", "ecx", "memory");
// cr3 is still 0 only when the asm took the "fail" path
if(!cr3) {
return EOPNOTSUPP; }
/////////////////////////////////////////////////////
// psentry holds the paging-structure entry of the level just read (or the
// raw cr3 value before the first read); paddr is the final result.
unsigned long psentry=0, paddr=0;
//get pml5e (if applicable)
if(la57_flag) { //5-level paging
// shift should be 45, not 51 (see header note)
psentry=*(unsigned long *)\
PHYS_TO_VIRT( PE_ADDR_MASK(cr3)|(PML5_MASK(vaddr)>>51) );
if(!PE_P_FLAG(psentry)) {
return EFAULT; }}
else {
psentry=cr3; }
//get pml4e
uprintf("[debug]: cr3: 0x%lx\n", psentry);
uprintf("[debug]: &pml4e: 0x%lx\n", PHYS_TO_VIRT( PE_ADDR_MASK(psentry)|(PML4_MASK(vaddr)>>42) ));
// shift should be 36, not 42 (see header note)
psentry=*(unsigned long *)\
PHYS_TO_VIRT( PE_ADDR_MASK(psentry)|(PML4_MASK(vaddr)>>42) );
uprintf("[debug]: pml4e: 0x%lx\n", psentry);
if(!PE_P_FLAG(psentry)) {
return EFAULT; }
//get pdpte
// shift should be 27, not 33 (see header note)
psentry=*(unsigned long *)\
PHYS_TO_VIRT( PE_ADDR_MASK(psentry)|(PDPT_MASK(vaddr)>>33) );
uprintf("[debug]: pdpte: 0x%lx\n", psentry);
// NOTE(review): the PS flag is tested before the present flag here.
if(PE_PS_FLAG(psentry)) { //1GB page
//bits (51 to 30) | bits (29 to 0)
// NOTE: the entry mask 0x0ffffc00000000 actually selects bits 51..34,
// which does not match the comment above.
paddr=(psentry&0x0ffffc00000000)|(vaddr&0x3fffffff);
return copyout(&paddr, to_fill, sizeof(unsigned long)); }
if(!PE_P_FLAG(psentry)) {
return EFAULT; }
//get pde
// shift should be 18, not 24 (see header note)
psentry=*(unsigned long *)\
PHYS_TO_VIRT( PE_ADDR_MASK(psentry)|(PD_MASK(vaddr)>>24) );
uprintf("[debug]: pde: 0x%lx\n", psentry);
if(PE_PS_FLAG(psentry)) { //2MB page
//bits (51 to 21) | bits (20 to 0)
// NOTE: the masks used select bits 51..17 of the entry and bits 16..0 of
// vaddr, which does not match the comment above.
paddr=(psentry&0x0ffffffffe0000)|(vaddr&0x1ffff);
return copyout(&paddr, to_fill, sizeof(unsigned long)); }
if(!PE_P_FLAG(psentry)) {
return EFAULT; }
//get pte
// shift should be 9, not 15 (see header note)
psentry=*(unsigned long *)\
PHYS_TO_VIRT( PE_ADDR_MASK(psentry)|(PT_MASK(vaddr)>>15) );
uprintf("[debug]: pte: 0x%lx\n", psentry);
// 4KB page: entry bits 51..12 | vaddr bits 11..0.  The present flag of the
// PTE itself is not tested before the address is returned.
paddr=(psentry&0x0ffffffffff000)|(vaddr&0xfff);
return copyout(&paddr, to_fill, sizeof(unsigned long)); }
static
struct sysent vtp_sysent = {
2,
vtp };
static int offset=NO_SYSCALL;
/*
 * Module event handler passed to SYSCALL_MODULE.
 *
 * Announces load and unload on the caller's terminal together with the
 * current value of `offset`; every other event is rejected.
 *
 * Returns 0 for MOD_LOAD and MOD_UNLOAD, EOPNOTSUPP otherwise.
 */
static int
load(struct module *module, int cmd, void *arg)
{
	if (cmd == MOD_LOAD) {
		uprintf("loading syscall at offset %d\n", offset);
		return 0;
	}

	if (cmd == MOD_UNLOAD) {
		uprintf("unloading syscall from offset %d\n", offset);
		return 0;
	}

	return EOPNOTSUPP;
}

/* Declare the module: name, slot variable, sysent entry, event handler. */
SYSCALL_MODULE(vtp, &offset, &vtp_sysent, load, NULL);
I found the DMAP_MIN_ADDRESS constant just by grepping through the /sys/amd64 directory, and I am fairly confident I have the correct one, as the code doesn't cause any kernel panics when called. Once I load the module, I test the syscall with the following code:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/module.h>
/*
 * Test driver for the vtp system call.
 *
 * Looks up the module "sys/vtp", reads the system call number from its
 * module_stat data, asks the system call to translate the address of a
 * local variable and prints both addresses.
 *
 * Exits with EXIT_FAILURE (after perror) if any of the three calls fails.
 */
int
main(void)
{
	int x = 0;
	unsigned long vaddr = (unsigned long)&x;
	unsigned long paddr = 0;
	struct module_stat stat;
	int modid;
	int syscall_num;

	/* modstat() expects the caller to announce the structure size. */
	stat.version = sizeof(stat);

	modid = modfind("sys/vtp");
	if (modid == -1) {
		perror("fatal in modfind");
		return EXIT_FAILURE;
	}

	if (modstat(modid, &stat) == -1) {
		perror("fatal in modstat");
		return EXIT_FAILURE;
	}
	syscall_num = stat.data.intval;

	/* syscall(2) reports failure as -1 with errno set. */
	if (syscall(syscall_num, vaddr, &paddr) == -1) {
		perror("fatal in syscall");
		return EXIT_FAILURE;
	}

	printf("virtual address: %p\n"
	    "physical address: %p\n",
	    (void *)vaddr, (void *)paddr);
	return 0;
}
Unfortunately, I get the following strange output:
$ ./vtp_test
[debug]: cr3: 0x2d48663c
[debug]: &pml4e: 0xfffff8002d48601f
[debug]: pml4e: 0x0
fatal in syscall: Bad address
So, for some reason the "PML4 entry" comes back as 0, which is obviously incorrect. I suspect the issue must be in my implementation of the address resolution algorithm given in the Intel developers' guide, but I can't see where the error is. Can anyone give some insight?
P.S. I am of course running this on a virtual machine, so is it possible this is causing some problem?
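Ah, it was a stupid mistake: I used incorrect bit-shift values. Each 9-bit table index has to end up in bits 11:3 of the entry address (index × 8 bytes per entry), so the shifts must be 45, 36, 27, 18 and 9 rather than 51, 42, 33, 24 and 15. The following code is corrected (with some additional debug statements added):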
#include <sys/types.h>
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/module.h>
#include <sys/sysent.h>
#include <sys/kernel.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
/*
 * Argument block handed to the vtp system call.
 * The handler reads both members from the pointer it receives in `args`.
 */
struct vtp_args {
	/* virtual address whose translation is requested */
	unsigned long	vaddr;
	/* destination the handler copyout()s the physical address to */
	unsigned long	*to_fill;
};
/*
 * Virtual-address index fields.
 * Each macro isolates one 9-bit table index and leaves it in place
 * (unshifted); all other bits are cleared.
 */
#define PML5_MASK(x)	((x) & (0x1ffL << 48))	/* bits 56..48 */
#define PML4_MASK(x)	((x) & (0x1ffL << 39))	/* bits 47..39 */
#define PDPT_MASK(x)	((x) & (0x1ffL << 30))	/* bits 38..30 */
#define PD_MASK(x)	((x) & (0x1ffL << 21))	/* bits 29..21 */
#define PT_MASK(x)	((x) & (0x1ffL << 12))	/* bits 20..12 */

/*
 * Paging-structure entry fields.
 */
#define PE_ADDR_MASK(x)	((x) & (0xffffffffffL << 12))	/* bits 51..12 */
#define PE_PS_FLAG(x)	((x) & (1L << 7))		/* page size flag */
#define PE_P_FLAG(x)	((x) & 0x1)			/* present flag */

/*
 * Physical-to-virtual conversion: OR the physical address into the
 * DMAP_MIN_ADDRESS base.
 */
#define DMAP_MIN_ADDRESS	(0xfffff80000000000UL)
#define PHYS_TO_VIRT(x)		((x) | DMAP_MIN_ADDRESS)
/* Architectural bits used by the paging-mode probe (Intel SDM Vol. 3A). */
#define VTP_CR0_PG	(1UL << 31)	/* CR0.PG: paging enabled */
#define VTP_CR4_LA57	(1UL << 12)	/* CR4.LA57: 5-level paging enabled */
#define VTP_MSR_EFER	0xc0000080U	/* IA32_EFER MSR number */
#define VTP_EFER_LME	(1UL << 8)	/* IA32_EFER.LME: long mode enabled */

/* Frame / offset split for pages mapped by a PDPTE (1GB) or a PDE (2MB). */
#define VTP_1G_FRAME(e)		((e) & 0x000fffffc0000000UL)	/* bits 51..30 */
#define VTP_1G_OFFSET(v)	((v) & 0x000000003fffffffUL)	/* bits 29..0 */
#define VTP_2M_FRAME(e)		((e) & 0x000fffffffe00000UL)	/* bits 51..21 */
#define VTP_2M_OFFSET(v)	((v) & 0x00000000001fffffUL)	/* bits 20..0 */
#define VTP_4K_OFFSET(v)	((v) & 0x0000000000000fffUL)	/* bits 11..0 */

/* Read control register CR0. */
static inline unsigned long
vtp_read_cr0(void)
{
	unsigned long val;

	__asm__ __volatile__("mov %%cr0, %0" : "=r"(val));
	return val;
}

/* Read control register CR3 (base of the top-level paging structure). */
static inline unsigned long
vtp_read_cr3(void)
{
	unsigned long val;

	__asm__ __volatile__("mov %%cr3, %0" : "=r"(val));
	return val;
}

/* Read control register CR4. */
static inline unsigned long
vtp_read_cr4(void)
{
	unsigned long val;

	__asm__ __volatile__("mov %%cr4, %0" : "=r"(val));
	return val;
}

/*
 * Read a model-specific register.  rdmsr takes the MSR number in ECX and
 * returns the value in EDX:EAX, so both result registers are declared as
 * outputs instead of being silently overwritten.
 */
static inline unsigned long
vtp_read_msr(unsigned int msr)
{
	unsigned int lo, hi;

	__asm__ __volatile__("rdmsr" : "=a"(lo), "=d"(hi) : "c"(msr));
	return ((unsigned long)hi << 32) | lo;
}

/*
 * Fetch one paging-structure entry.
 *
 * `table` is the previous level's entry (or CR3); its address field selects
 * the table.  `byte_off` is the index of the wanted entry already scaled by
 * the 8-byte entry size.  The entry's direct-map address and its value are
 * printed with the given labels, then the value is returned.
 */
static unsigned long
vtp_load_entry(unsigned long table, unsigned long byte_off,
    const char *addr_label, const char *value_label)
{
	unsigned long entry_va, value;

	entry_va = PHYS_TO_VIRT(PE_ADDR_MASK(table) | byte_off);
	uprintf("[debug]: %s0x%lx\n", addr_label, entry_va);
	value = *(const unsigned long *)entry_va;
	uprintf("[debug]: %s0x%lx\n", value_label, value);
	return value;
}

/*
 * vtp: system call handler that translates uap->vaddr into a physical
 * address by walking the current paging structures, and copies the result
 * out to uap->to_fill.
 *
 * Returns:
 *   EOPNOTSUPP  paging or long mode is not enabled (PAE/32-bit paging is
 *               not handled);
 *   EFAULT      an entry on the walk is not present;
 *   otherwise   the result of copyout().
 *
 * The shift applied to each index field is (field position - 3), so the
 * 9-bit index lands in bits 11:3, i.e. index * sizeof(entry).
 *
 * NOTE(review): the walk takes no locks, so the tables could change while
 * they are being read -- confirm this is acceptable for the intended use.
 */
static int
vtp(struct thread *td, void *args)
{
	struct vtp_args *uap = args;
	unsigned long vaddr = uap->vaddr;
	unsigned long *to_fill = uap->to_fill;
	unsigned long psentry, paddr;

	if ((vtp_read_cr0() & VTP_CR0_PG) == 0 ||
	    (vtp_read_msr(VTP_MSR_EFER) & VTP_EFER_LME) == 0)
		return EOPNOTSUPP;

	/* The walk starts at CR3, which is used like a top-level "entry". */
	psentry = vtp_read_cr3();

	/* pml5e (only with 5-level paging) */
	if ((vtp_read_cr4() & VTP_CR4_LA57) != 0) {
		psentry = vtp_load_entry(psentry, PML5_MASK(vaddr) >> 45,
		    "&pml5e:\t", "pml5e:\t\t");
		if (!PE_P_FLAG(psentry))
			return EFAULT;
	}

	/* pml4e */
	psentry = vtp_load_entry(psentry, PML4_MASK(vaddr) >> 36,
	    "&pml4e:\t", "pml4e:\t\t");
	if (!PE_P_FLAG(psentry))
		return EFAULT;

	/* pdpte -- present must be checked before PS is interpreted */
	psentry = vtp_load_entry(psentry, PDPT_MASK(vaddr) >> 27,
	    "&pdpte:\t", "pdpte:\t\t");
	if (!PE_P_FLAG(psentry))
		return EFAULT;
	if (PE_PS_FLAG(psentry)) {	/* 1GB page */
		paddr = VTP_1G_FRAME(psentry) | VTP_1G_OFFSET(vaddr);
		return copyout(&paddr, to_fill, sizeof(paddr));
	}

	/* pde */
	psentry = vtp_load_entry(psentry, PD_MASK(vaddr) >> 18,
	    "&pde:\t\t", "pde:\t\t");
	if (!PE_P_FLAG(psentry))
		return EFAULT;
	if (PE_PS_FLAG(psentry)) {	/* 2MB page */
		paddr = VTP_2M_FRAME(psentry) | VTP_2M_OFFSET(vaddr);
		return copyout(&paddr, to_fill, sizeof(paddr));
	}

	/* pte */
	psentry = vtp_load_entry(psentry, PT_MASK(vaddr) >> 9,
	    "&pte:\t\t", "pte:\t\t");
	if (!PE_P_FLAG(psentry))
		return EFAULT;

	/* 4KB page: entry bits 51..12 | vaddr bits 11..0 */
	paddr = PE_ADDR_MASK(psentry) | VTP_4K_OFFSET(vaddr);
	return copyout(&paddr, to_fill, sizeof(paddr));
}
static
struct sysent vtp_sysent = {
2,
vtp };
static int offset=NO_SYSCALL;
/*
 * Module event handler passed to SYSCALL_MODULE.
 *
 * Announces load and unload on the caller's terminal together with the
 * current value of `offset`; every other event is rejected.
 *
 * Returns 0 for MOD_LOAD and MOD_UNLOAD, EOPNOTSUPP otherwise.
 */
static int
load(struct module *module, int cmd, void *arg)
{
	if (cmd == MOD_LOAD) {
		uprintf("loading syscall at offset %d\n", offset);
		return 0;
	}

	if (cmd == MOD_UNLOAD) {
		uprintf("unloading syscall from offset %d\n", offset);
		return 0;
	}

	return EOPNOTSUPP;
}

/* Declare the module: name, slot variable, sysent entry, event handler. */
SYSCALL_MODULE(vtp, &offset, &vtp_sysent, load, NULL);
It yields the much better-looking output:
$ ./vtp_test
[debug]: &pml4e: 0xfffff800341a27f8
[debug]: pml4e: 0x80000000341fc067
[debug]: &pdpte: 0xfffff800341fcff8
[debug]: pdpte: 0x341b7067
[debug]: &pde: 0xfffff800341b7ff8
[debug]: pde: 0x34174067
[debug]: &pte: 0xfffff80034174ff0
[debug]: pte: 0x8000000030de9467
virtual address: 0x7fffffffea9c
physical address: 0x30de9a9c