code: 9ferno

ref: fa18daa2f56b306b151781523e877bd07d62f799
dir: /os/pc64/memory.c/

View raw version
#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "io.h"
#include "ureg.h"

/*
memory.c identify memory from bios, e820
	puts memory entries to memmap -> conf.mem[] -> xlist.hole's
	the kernel low level page allocator allocates from the xlist holes (xspanalloc)
	inferno's meminit() identifies all memory
	9front splits memory initialization to meminit0() and meminit().
		after meminit0(), it runs archinit() to ensure architecture specific memory
		initialization is accounted for.
 */

#define DP if(1){}else print

enum {
	MemUPA		= 0,	/* unbacked physical address */
	MemUMB		= 1,	/* upper memory block (<16MB) */
	MemRAM		= 2,	/* physical memory */
	MemACPI		= 3,	/* ACPI tables */
	MemReserved	= 4,	/* don't allocate */

	KB = 1024,
};

u64	MemMin;		/* set by l.s */

/* TODO just use xspanalloc. I do not know what the memmapalloc() does. It does not seem to work anyway. inferno pc does not use it either? can refactor all this code.
 */
void*
rampage(void)
{
	uintptr pa;

	if(conf.mem[0].npage != 0)
		return xspanalloc(BY2PG, BY2PG, 0);

	/*
	 * Allocate from the map directly to make page tables.
	 */
	DP("before calling rampage\n");
	/* memmapdump(); */
	pa = memmapalloc(-1, BY2PG, BY2PG, MemRAM);
	if(pa == -1)
		panic("rampage: out of memory\n");
	DP("rampage returned 0x%p\n", pa);
	return (void*)pa;
}

static void
mapkzero(uintptr base, uintptr len, int type)
{
	uintptr flags, n;

	DP("mapkzero base 0x%p len %llud 0x%llux type 0x%x\n",
		base, len, len, type);
	if(base < MemMin && base+len > MemMin){
		mapkzero(base, MemMin-base, type);
		len = base+len-MemMin;
		base = MemMin;
	}

	switch(type){
	default:
		return;
	case MemRAM:
		if(base < MemMin)
			return;
		flags = PTEGLOBAL|PTEWRITE|PTEVALID;
		break;
	case MemUMB:
		flags = PTEGLOBAL|PTEWRITE|PTEUNCACHED|PTEVALID;
		break;
	}
	pmap(base, flags, len);
}

/*
 * map kernel text segment readonly
 * and everything else no-execute.
 */
static void
kernelro(void)
{
	uintptr *pte, psz, va;

	for(va = KTZERO; va <= (uintptr)etext; va += psz){
		psz = PGLSZ(0);
		pte = mmuwalk((uintptr*)PML4ADDR, va, 0, 0);
		if(pte == nil){
			print("kernelro va 0x%p\n", va);
			panic("kernelro");
		}
		if((*pte & PTEVALID) == 0){
			print("kernelro invalid page va 0x%p pte 0x%p *pte 0x%zux\n",
				va, pte, *pte);
			panic("kernelro invalid page\n");
		}
		if(va >= KTZERO && va < (uintptr)etext){
			*pte &= ~PTEWRITE;
			*pte |= PTEGLOBAL;
		}
	}
	mmuflushtlb();
}

static uintptr
ebdaseg(void)
{
	uchar *bda;

	if(memcmp(KADDR(0xfffd9), "EISA", 4) != 0)
		return 0;
	bda = KADDR(0x400);
	return ((bda[0x0f]<<8)|bda[0x0e]) << 4;
}

static uintptr
convmemsize(void)
{
	uintptr top;
	uchar *bda;

	bda = KADDR(0x400);
	top = ((bda[0x14]<<8) | bda[0x13])*KB;

	if(top < 64*KB || top > 640*KB)
		top = 640*KB;	/* sanity */

	/* Reserved for BIOS tables */
	top -= 1*KB;

	return top;
}

static void
lowraminit(void)
{
	uintptr base, pa, len;
	uchar *p;

	/*
	 * Discover the memory bank information for conventional memory
	 * (i.e. less than 640KB). The base is the first location after the
	 * bootstrap processor MMU information and the limit is obtained from
	 * the BIOS data area.
	 */
	base = PADDR(CPU0END);
	pa = convmemsize();
	if(base < pa)
		memmapadd(base, pa-base, MemRAM);

	/* Reserve BIOS tables */
	memmapadd(pa, 1*KB, MemReserved);

	/* Reserve EBDA */
	if((pa = ebdaseg()) != 0)
		memmapadd(pa, 1*KB, MemReserved);
	memmapadd(0xA0000-1*KB, 1*KB, MemReserved);

	/* Reserve the VGA frame buffer */
	umballoc(0xA0000, 128*KB, 0);

	/* Reserve VGA ROM */
	memmapadd(0xC0000, 64*KB, MemReserved);

	/*
	 * Scan the Upper Memory Blocks (0xD0000->0xF0000) for device BIOS ROMs.
	 * This should start with a two-byte header of 0x55 0xAA, followed by a
	 * byte giving the size of the ROM in 512-byte chunks.
	 * These ROM's must start on a 2KB boundary.
	 */
	for(p = (uchar*)KADDR(0xD0000); p < (uchar*)KADDR(0xF0000); p += len){
		len = 2*KB;
		if(p[0] == 0x55 && p[1] == 0xAA){
			if(p[2] != 0)
				len = p[2]*512;
			memmapadd(PADDR(p), len, MemReserved);
			len = ROUND(len, 2*KB);
		}
	}

	/* Reserve BIOS ROM */
	memmapadd(0xF0000, 64*KB, MemReserved);
}

int
checksum(void *v, int n)
{
	uchar *p, s;

	s = 0;
	p = v;
	while(n-- > 0)
		s += *p++;
	return s;
}

static void*
sigscan(uchar *addr, int len, char *sig, int size, int step)
{
	uchar *e, *p;
	int sl;

	sl = strlen(sig);
	e = addr+len-(size > sl ? size : sl);
	for(p = addr; p <= e; p += step){
		if(memcmp(p, sig, sl) != 0)
			continue;
		if(size && checksum(p, size) != 0)
			continue;
		return p;
	}
	return nil;
}

void*
sigsearch(char* signature, int size)
{
	uintptr p;
	void *r;

	/*
	 * Search for the data structure:
	 * 1) within the first KiB of the Extended BIOS Data Area (EBDA), or
	 * 2) within the last KiB of system base memory if the EBDA segment
	 *    is undefined, or
	 * 3) within the BIOS ROM address space between 0xf0000 and 0xfffff
	 *    (but will actually check 0xe0000 to 0xfffff).
	 */
	if((p = ebdaseg()) != 0){
		if((r = sigscan(KADDR(p), 1*KB, signature, size, 16)) != nil)
			return r;
	}
	if((r = sigscan(KADDR(convmemsize()), 1*KB, signature, size, 16)) != nil)
		return r;

	/* hack for virtualbox: look in KiB below 0xa0000 */
	if((r = sigscan(KADDR(0xA0000-1*KB), 1*KB, signature, size, 16)) != nil)
		return r;

	return sigscan(KADDR(0xE0000), 128*KB, signature, size, 16);
}

void*
rsdsearch(void)
{
	static char signature[] = "RSD PTR ";
	uintptr base, size;
	uchar *v, *p;

	if((p = sigsearch(signature, 36)) != nil)
		return p;
	if((p = sigsearch(signature, 20)) != nil)
		return p;

	for(base = memmapnext(-1, MemACPI); base != -1; base = memmapnext(base, MemACPI)){
		size = memmapsize(base, 0);
		if(size == 0 || size > 0x7fffffff)
			continue;
		if((v = vmap(base, size)) != nil){
			p = sigscan(v, size, signature, 36, 4);
			if(p == nil)
				p = sigscan(v, size, signature, 20, 4);
			vunmap(v, size);
			if(p != nil)
				return vmap(base + (p - v), 64);
		}
	}
	return nil;
}

/*
 * Give out otherwise-unused physical address space
 * for use in configuring devices.  Note that upaalloc
 * does not map the physical address into virtual memory.
 * Call vmap to do that.
 */
uintptr
upaalloc(uintptr pa, u32 size, u32 align)
{
	DP("before memmapalloc pa 0x%p size 0x%x %d\n",
		pa, size, size);
	return memmapalloc(pa, size, align, MemUPA);
}

uintptr
upamalloc(uintptr pa, u32 size, u32 align)
{
	DP("before memmapalloc pa 0x%p size 0x%x %d\n",
		pa, size, size);
	return memmapalloc(pa, size, align, MemUPA);
}

uintptr
upaallocwin(uintptr pa, u32 win, u32 size, u32 align)
{
	uvlong a, base, top = pa + win;

	for(base = memmapnext(-1, MemUPA); base != -1 && base < top; base = memmapnext(base, MemUPA)){
		if(base < pa){
			if(pa >= base + memmapsize(base, 0))
				continue;
			base = pa;
		}
		a = upaalloc(base, size, align);
		if(a != -1)
			return a;
	}
	return -1ULL;
}

void
upafree(uintptr pa, u32 size)
{
	memmapfree(pa, size, MemUPA);
}

/*
 * Allocate memory from the upper memory blocks.
 */
uintptr
umballoc(uintptr pa, u32 size, u32 align)
{
	return (uintptr)memmapalloc(pa == -1UL ? -1ULL : (uvlong)pa, size, align, MemUMB);
}

void
umbfree(uintptr pa, u32 size)
{
	memmapfree(pa, size, MemUMB);
}

static void
umbexclude(void)
{
	uintptr pa, size;
	char *op, *p, *rptr;

	if((p = getconf("umbexclude")) == nil)
		return;

	while(p && *p != '\0' && *p != '\n'){
		op = p;
		pa = strtoul(p, &rptr, 0);
		if(rptr == nil || rptr == p || *rptr != '-'){
			print("umbexclude: invalid argument <%s>\n", op);
			break;
		}
		p = rptr+1;

		size = strtoul(p, &rptr, 0) - pa + 1;
		if(size <= 0){
			print("umbexclude: bad range <%s>\n", op);
			break;
		}
		if(rptr != nil && *rptr == ',')
			*rptr++ = '\0';
		p = rptr;

		memmapalloc(pa, size, 0, MemUMB);
	}
}

static void
mtrrexclude(int type, char *expect)
{
	uintptr base, top, next, pa;
	char *attr;

	for(base = memmapnext(-1, type); base != -1; base = memmapnext(base, type)){
		top = base + memmapsize(base, 0);
		for(pa = base; pa < top; pa = next){
			next = top;
			attr = mtrrattr(pa, &next);
			if(attr != nil && strcmp(attr, expect) != 0){
				if(next > top)
					next = top;
				memmapadd(pa, next - pa, MemReserved);
			}
			base = pa;
		}
	}
}

static int
e820scan(void)
{
	uintptr base, top, size;
	int type;
	char *s;

	/* passed by bootloader */
	if((s = getconf("*e820")) == nil)
		if((s = getconf("e820")) == nil)
			return -1;

	for(;;){
		while(*s == ' ')
			s++;
		if(*s == 0)
			break;
		type = 1;
		if(s[1] == ' '){	/* new format */
			type = s[0] - '0';
			s += 2;
		}
		base = strtoull(s, &s, 16);
		if(*s != ' ')
			break;
		top  = strtoull(s, &s, 16);
		if(*s != ' ' && *s != 0)
			break;
		if(base >= top)
			continue;
		switch(type){
		case 1:
			memmapadd(base, top - base, MemRAM);
			break;
		case 3:
			memmapadd(base, top - base, MemACPI);
			break;
		default:
			memmapadd(base, top - base, MemReserved);
		}
	}

	/* RAM needs to be writeback */
	mtrrexclude(MemRAM, "wb");

	return 0;
}

/*
21:00 < joe7> ori, you around? I am trying to figure out how ramscan() works. When writing the pattern, why does it do this: *k0 = ~pat; ?
21:01 < joe7> that value is not used anywhere else again in that subroutine.
21:06 < joe7> nemo's kernel book on page 51 asks the same question too.
21:06 < joe7> It says, "the author is saving the value actually stored at address KZERO, can you guess why?"
0:36 < cinap_lenrek> joe7: it is to detect aliases
00:36 < cinap_lenrek> joe7: it is a RAM tester basically
00:36 < cinap_lenrek> joe7: you should not use this at all
00:37 < cinap_lenrek> in modern machines, this approach is impossible
00:37 < cinap_lenrek> because ram now contains life firmware for devices and megabytes of persistent firmware code
00:40 < cinap_lenrek> if you write to that or damage that info the whole machine will just lock up or brick itself
04:38 < joe7> cinap_lenrek: joe7: you should not use this at all -- which one? the ramscan() or *k = pat?
04:39 < joe7> I see ramscan in 9front 9/pc/memory.c too
04:39 < joe7> Iif(e820scan() < 0)
04:39 < joe7> IIramscan(MemMin, -((uintptr)MemMin), 4*MB);
04:39 < joe7> that is how the ram is being discovered
04:46 < cinap_lenrek> e820scan() just reads the *e820= plan9.ini parameter
04:46 < cinap_lenrek> ramscan() is a fallback
04:46 < cinap_lenrek> when theres no memory map
04:46 < cinap_lenrek> like on reeeeeally old machines
04:46 < cinap_lenrek> or when bios is totally broken
04:47 < cinap_lenrek> dont worry about it
04:47 < cinap_lenrek> theres not really a point in supporting it anymore
04:47 < cinap_lenrek> as i said
04:47 < cinap_lenrek> it is a destructive memory test
04:47 < joe7> oh, ok. I must have screwed up something as it is going to ramscan() each time.
04:47 < joe7> I will fix it. Thanks.
 */
static void
ramscan(uintptr pa, uintptr top)
{
	uintptr save, pat, seed, *v, *k0, *pte;
	uintptr i, n, w;
	char *attr;
	uintptr chunk;

	chunk = PGLSZ(0);
	pa += chunk-1;
	pa &= ~(chunk-1);
	top &= ~(chunk-1);

	n = chunk/sizeof(*v);
	w = BY2PG/sizeof(*v);

	k0 = KADDR(0);
	save = *k0;

	DP("ramscan\n");
	pat = 0x12345678UL;
	for(; pa < top; pa += chunk){
		DP("ramscan pa 0x%p\n", pa);
		v = (uintptr*)pa;
		attr = mtrrattr(pa, nil);
		if(attr != nil && strcmp(attr, "wb") != 0)
			goto Skip;

		/* write pattern */
		seed = pat;
		if((pte = mmuwalk((uintptr*)PML4ADDR, pa, 0, 1)) == nil)
			continue;
		*pte = pa|PTEWRITE|PTEVALID;
		for(i = 0; i < n; i += w){
			pat += 0x3141526UL;
			v[i] = pat;
			*k0 = ~pat;
			if(v[i] != pat)
				goto Bad;
		}

		/* verify pattern */
		pat = seed;
		for(i = 0; i < n; i += w){
			pat += 0x3141526UL;
			if(v[i] != pat)
				goto Bad;
		}

		memmapadd(pa, chunk, MemRAM);
		mapkzero(pa, chunk, MemRAM);
		continue;

	Bad:
		*pte = 0;

	Skip:
		if(pa+chunk <= 16*MB)
			memmapadd(pa, chunk, MemUMB);

		/*
		 * If we encounter a chunk of missing memory
		 * at a sufficiently high offset, call it the end of
		 * memory.  Otherwise we run the risk of thinking
		 * that video memory is real RAM.
		 */
		if(pa >= 32*MB)
			break;
	}

	*k0 = save;
}

void
showpagetables(uintptr *pml4)
{
	uintptr *epml4, pml4e, *pdp, *epdp, pdpe, *pd, *epd, pde, *pt, *ept, pte, cr3, pa;

	cr3 = getcr3();
	print("CR3 0x%zux cpu0pml4 0x%p\n"
		"\tpml4 base address 0x%zux\n"
		"\tpage-level writethrough bit 0x%zux"
		" page-level cache disable bit 0x%zux\n",
		cr3, PML4ADDR, cr3&(~0xFFF),
		cr3&(1<<4)>>4,
		cr3&(1<<3)>>3);
	epml4 = pml4+512;
	for(; pml4 != epml4; pml4++){
		if(*pml4 == 0)
			continue;
		pml4e = *pml4;
		pdp = (uintptr*)(pml4e&(~0xFFF));
		print("pml4 0x%p has 0x%zx pdp base address 0x%p\n", pml4, pml4e, pdp);
		epdp = pdp + 512;
		for(; pdp != epdp; pdp++){
			if(*pdp == 0)
				continue;
			pdpe = *pdp;
			pd = (uintptr*)(pdpe&(~0xFFF));
			epd = pd + 512;
			print("\tpdp 0x%p has 0x%zx pd base address 0x%p\n",
				pdp, pdpe, pd);
			for(; pd != epd; pd++){
				if(*pd == 0)
					continue;
				pde = *pd;
				pt = (uintptr*)(pde&(~0xFFF));
				print("\t\tpd 0x%p has 0x%zx page base address 0x%p\n",
					pd, pde, pt);
				ept = pt + 512;
				for(; pt != ept; pt++){
					if(*pt == 0)
						continue;
					pte = *pt;
					pa = (uintptr)(pte&(~0xFFF));
					print("\t\t\tpt 0x%p has 0x%zx address 0x%zx\n",
						pt, pte, pa);
				}
			}
		}
	}
}

/*
 * Sort out initial memory map and discover RAM.
 */
void
meminit0(void)
{
	uintptr prevbase = 0, base, size = 0;

	print("Memory Configuration\n"
		"\tMemMin 0x%llux end 0x%p KZERO 0x%x KDZERO 0x%x\n"
		"\tKTZERO 0x%x etext 0x%p\n\tCPU0END 0x%llux\n"
		"\tPADDR(PGROUND((uintptr)end)) 0x%zux\n"
		"\tMemMin-PADDR(PGROUND((uintptr)end)) 0x%zux\n",
		MemMin, end, KZERO, KDZERO, KTZERO, etext, (uintptr)CPU0END,
		PADDR(PGROUND((uintptr)end)), MemMin-PADDR(PGROUND((uintptr)end)));
	/*
	 * Add the already mapped memory after the kernel.
	 */
	if(MemMin < PADDR(PGROUND((uintptr)end)))
		panic("kernel too big");
	memmapadd(PADDR(PGROUND((uintptr)end)), MemMin-PADDR(PGROUND((uintptr)end)), MemRAM);

	/*
	 * Memory below MemMin is reserved for the kernel.
	 * Also, set the kernel text pages read only
	 */
	memreserve(PADDR(KDZERO), PADDR(PGROUND((uintptr)MemMin))-PADDR(KDZERO));
	kernelro();

	/*
	 * Addresses below 16MB default to be upper
	 * memory blocks usable for ISA devices.
	 */
	memmapadd(0, 16*MB, MemUMB);

	/*
	 * Everything between 16MB and 4GB defaults
	 * to unbacked physical addresses usable for
	 * device mappings.
	 */
	memmapadd(16*MB, (u32)-16*MB, MemUPA);

	/*
	 * Discover conventional RAM, ROMs and UMBs.
	 */
	lowraminit();

	/*
	 * Discover more RAM and map to KZERO.
	 */
	if(e820scan() < 0)
		ramscan(MemMin, -((uintptr)MemMin));

	/*
	 * Exclude UMB's and UPA's with unusual cache attributes.
	 */
	mtrrexclude(MemUMB, "uc");
	mtrrexclude(MemUPA, "uc");
}

/*
 * Until the memory map is finalized by meminit(),
 * archinit() should reserve memory of discovered BIOS
 * and ACPI tables by calling memreserve() to prevent
 * them from getting allocated and trashed.
 * This is due to the UEFI and BIOS memory map being
 * unreliable and sometimes marking these ranges as RAM.
 */
void
memreserve(uintptr pa, uintptr size)
{
	assert(conf.mem[0].npage == 0);

	size += (pa & BY2PG-1);
	size &= ~(BY2PG-1);
	pa &= ~(BY2PG-1);
	memmapadd(pa, size, MemReserved);
}

/*
 * Finalize the memory map:
 *  (re-)map the upper memory blocks
 *  allocate all usable ram to the conf.mem[] banks
 *  memory is allocated to memory maps -> conf.mem[] -> xlists.hole
 */
void
meminit(void)
{
	uintptr base, size;
	Confmem *cm;

	umbexclude();
	for(base = memmapnext(-1, MemUMB); base != -1; base = memmapnext(base, MemUMB)){
		size = memmapsize(base, BY2PG) & ~(BY2PG-1);
		if(size != 0)
			mapkzero(PGROUND(base), size, MemUMB);
	}

	cm = &conf.mem[0];
	for(base = memmapnext(-1, MemRAM); base != -1; base = memmapnext(base, MemRAM)){
		size = memmapsize(base, BY2PG) & ~(BY2PG-1);
		if(size == 0)
			continue;
		if(cm >= &conf.mem[nelem(conf.mem)]){
			print("meminit: out of entries, loosing: %#p (%llud)\n", base, (uvlong)size);
			continue;
		}
		if(base < MemMin){
			print("meminit: ignoring RAM below MemMin base 0x%p size 0x%zd\n", base, size);
			continue;
		}
		cm->base = memmapalloc(base, size, BY2PG, MemRAM);
		if(cm->base == -1)
			continue;
		base = cm->base;
		cm->npage = size/BY2PG;
		cm++;
	}

	memmapdump();
	// showpagetables((uintptr*)PML4ADDR);
	//showpagetables((uintptr*)PML4ADDR);
}