#include "platform.h"
#include "memmap.h"
#include "printf.h"
#include "mpu.h"


//MMU domain numbers used by the mappings built in mpuEarlyInitC(). all other mappings there use domain 0.
//the number is placed into descriptor bits 5+ (<< 5) and selects a 2-bit field (<< 2 * n) in the domain control register
#define DOMAIN_NUM_STORAGE_RAM		1
#define DOMAIN_NUM_ROM				2	//so we can write to it if desired


//parameter block passed from the asm in mpuEarlyInit() to mpuEarlyInitC() and back.
//the asm reads the fields back by position (ldr for nextCodeAddr[0], then one ldmia for the rest),
//so the order and size of the fields must stay exactly as they are
struct EarlyInitParams {
	uint32_t nextCodeAddr[2];		//must be first, in: where we intend to go next. out: where we should go instead
	uint32_t ttbr;	//out: value for the translation table base register
	uint32_t domainControlReg;	//out: value for the domain access control register
	uint32_t cpuControlReg;	//out: value for the cp15 control register (this write turns the mmu on)
	void* nextSecondLevel;	//just pass this back to us when we have mmu
};


//init hook: by the time this runs, the mmu has already been set up by the early-init path,
//so this only logs that it was reached and reports success
bool mpuInit(void)
{
	loge("%s\n", __func__);
	
	//nothing left to configure here
	return true;
}

//sets (or, with addr == 0, disables) the stack guard.
//the mmu mappings used here are too coarse for this, so the request is handed to the platform code as-is
bool mpuSetStackGuard(uint32_t addr)
{
	return platSetStackGuard(addr);
}


//writes numMb consecutive 1MB section descriptors into the top-level table.
//	topLevel: base of the top-level table (one entry per megabyte of VA)
//	pa:       physical address of the first megabyte
//	va:       virtual address of the first megabyte (selects the starting entry)
//	flags:    descriptor bits to OR in (AP, domain, C, B, ...); the "section" type bits (2) are added here
//	numMb:    number of megabytes to map (0 writes nothing)
static void mpuPrvEarlyMapSections(uint32_t *topLevel, uint32_t pa, uint32_t va, uint32_t flags, uint32_t numMb)
{
	uint32_t *entry = topLevel + (va >> 20);
	uint32_t sectionDescr = pa | flags | 2;
	
	while (numMb--) {
		*entry++ = sectionDescr;
		sectionDescr += 1 << 20;	//next megabyte of PA
	}
}

//writes numPieces consecutive 4K small-page descriptors into a second-level (coarse) table.
//	secondLevel:      base of the second-level table
//	positionalOffset: index of the first entry to write
//	pa:               physical address of the first 4K page
//	flags:            descriptor bits to OR in (AP, C, B); the "small page" type bits (2) are added here
//	numPieces:        number of 4K pages to map (0 writes nothing)
static void	mpuEarlyMap4Kpieces(uint32_t *secondLevel, uint32_t positionalOffset, uint32_t pa, uint32_t flags, uint32_t numPieces)
{
	uint32_t i, descr;
	
	secondLevel += positionalOffset;
	
	for (i = 0, descr = pa | flags | 2; i < numPieces; i++, descr += 1 << 12)
		*secondLevel++ = descr;
}

//writes numPieces consecutive 64K large-page mappings into a second-level (coarse) table.
//a coarse table has one entry per 4K, so each 64K page's descriptor is written 16 times in a row.
//	secondLevel:      base of the second-level table
//	positionalOffset: index of the first entry to write
//	pa:               physical address of the first 64K page
//	flags:            descriptor bits to OR in (AP, C, B); the "large page" type bits (1) are added here
//	numPieces:        number of 64K pages to map (0 writes nothing)
static void	mpuEarlyMap64Kpieces(uint32_t *secondLevel, uint32_t positionalOffset, uint32_t pa, uint32_t flags, uint32_t numPieces)
{
	uint32_t *slot = secondLevel + positionalOffset;
	uint32_t largeDescr = pa | flags | 1;
	uint32_t piece, copy;
	
	for (piece = 0; piece < numPieces; piece++) {
		
		//same descriptor in all 16 slots this 64K page covers
		for (copy = 0; copy < 16; copy++)
			slot[copy] = largeDescr;
		
		slot += 16;
		largeDescr += 1 << 16;
	}
}

//computes the address `vecs` currently executes from, rounded down to 4K, using the running pc.
//callers in this file treat the result as the physical base of the rom image
//NOTE(review): the math relies on (pc - &getCurRomPa) being a small non-negative offset. if the compiler
//inlines this into a caller located elsewhere, that offset changes - confirm it still rounds away
static uintptr_t getCurRomPa(void)						//only works while mmu is off!			
{
	extern void vecs(void);	//presumably the very start of the rom image - TODO confirm against the linker script
	uint32_t ret;
	
	asm("mov %0, pc":"=r"(ret));	//address we are actually running at (plus the usual pc read-ahead)
	
	ret -= (uintptr_t)&getCurRomPa;	//minus where the linker thinks this function is
	ret += (uintptr_t)&vecs;	//plus where the linker thinks vecs is -> where vecs really is, plus a small offset
	
	return ret &~ 0x00000fff;	//round down to 4K to drop that small offset
}

//translates a physical code address inside the rom image into the virtual address it will have once the mmu is on.
//hangs forever if the address is not inside [rom pa, rom pa + CPU_ROM_SIZE)
static uintptr_t mpuPrvEarlyP2vCode(uintptr_t v)		//only works while mmu is off!
{
	const uint32_t romPa = getCurRomPa();
	
	//not a rom address: nothing sane we can do this early, so stop here
	if (v < romPa || v - romPa >= CPU_ROM_SIZE) {
		while(1);
	}
	
	return v + (CPU_ROM_BASE - romPa);
}

/*
	in the 64K of VA (CPU_PAGETABLE_BASE == 0xffff0000) & PA (CPU_PAGETABLE_PA) that we reserve for pagetables, vectors, etc., we use the memory as follows:
	
	0x0000..0x0040	vectors, later made RO by mmuLockExcPage()
	0x4000..0x7fff	DMA memory (uncached, unbuffered)
	0x8000..0xbfff	second level tables as needed (we only use coarse pagetables, so 256 entries each, 1K each, we have space for 16 of them)
	0xc000..0xffff	top level page table
*/

/*
	builds the initial page tables and fills in *params. called from the asm in mpuEarlyInit() while the mmu is
	still off, so every table pointer used here is a PHYSICAL address (CPU_PAGETABLE_PA based).
	
	in:  params->nextCodeAddr[] hold physical code addresses
	out: params->nextCodeAddr[] are translated to their post-mmu virtual addresses (hangs if one is not in rom),
	     params->ttbr / domainControlReg / cpuControlReg hold the values the asm must load,
	     params->nextSecondLevel is the next unused second-level table (PA)
	returns params
*/
struct EarlyInitParams* __attribute__((used)) mpuEarlyInitC(struct EarlyInitParams *params)		//return params
{
	uint32_t *topLevel = (uint32_t*)(CPU_PAGETABLE_PA + 0xc000), *secondLevel = 0, *nextSecondLevel = (uint32_t*)(CPU_PAGETABLE_PA + 0x8000);
	uint32_t cpuRomPa = getCurRomPa();
	uint32_t i;
	
	//start off with a full invalid map
	for (i = 0; i < 4096; i++)
		topLevel[i] = 0;
	
	//map in rom (round as needed)
	//AP=0, Domain=DOMAIN_NUM_ROM, C=1, B=0, section
	mpuPrvEarlyMapSections(topLevel, cpuRomPa &~ 0xfffff, CPU_ROM_BASE &~ 0xfffff, 0x008 | (DOMAIN_NUM_ROM << 5), ((CPU_ROM_BASE + CPU_ROM_SIZE + (1 << 20) - 1) >> 20) - (CPU_ROM_BASE >> 20));
	
	//map in dynamic ram
	#if (CPU_DYN_RAM_BASE & 0xfffff)
		#error "dyn. ram base VA must be megabyte-aligned"
	#endif
	#if (CPU_DYN_RAM_PA & 0xfffff)
		#error "dyn. ram base PA must be megabyte-aligned"
	#endif
	#if (CPU_DYN_RAM_SIZE & 0xfffff)
		#error "dyn. ram size must be a megabyte-multiple"
	#endif
	//AP=3, Domain=0, C=1, B=1, section
	mpuPrvEarlyMapSections(topLevel, CPU_DYN_RAM_PA, CPU_DYN_RAM_BASE, 0xc0c, CPU_DYN_RAM_SIZE >> 20);
	
	//map in storage ram
	#if (CPU_STORAGE_RAM_BASE & 0xfffff)
		#error "storage ram base VA must be megabyte-aligned"
	#endif
	#if (CPU_STORAGE_RAM_PA & 0xfffff)
		#error "storage ram base PA must be megabyte-aligned"
	#endif
	#if (CPU_STORAGE_RAM_SIZE & 0xfffff)
		#error "storage ram size must be a megabyte-multiple"
	#endif
	//AP=0, Domain=DOMAIN_NUM_STORAGE_RAM, C=1, B=1, section
	mpuPrvEarlyMapSections(topLevel, CPU_STORAGE_RAM_PA, CPU_STORAGE_RAM_BASE, 0x00c | (DOMAIN_NUM_STORAGE_RAM << 5), CPU_STORAGE_RAM_SIZE >> 20);
	
	//an area for d-cache cleaning, AP=3, Domain=0, C=1, B=1, section
	mpuPrvEarlyMapSections(topLevel, CPU_CACHE_CLEAN_PA, CPU_CACHE_CLEAN_BASE, 0xc0c, CPU_CACHE_CLEAN_SIZE >> 20);
	
	//an area for mini-dcache cleaning for xscale
	#ifdef CPU_MINIDCACHE_CLEAN_BASE
		//to make it mini-dcacheable, we need X = 1, C = 1, B = 0, AP=3
		mpuPrvEarlyMapSections(topLevel, CPU_MINIDCACHE_CLEAN_PA, CPU_MINIDCACHE_CLEAN_BASE, 0x1c0a, CPU_MINIDCACHE_CLEAN_SIZE >> 20);
	#endif
	
	//flash regions, if any: AP=3, Domain=0, C=0, B=0, section
	#ifdef CPU_FLASH_1_BASE
		mpuPrvEarlyMapSections(topLevel, CPU_FLASH_1_PA, CPU_FLASH_1_BASE, 0xc00, CPU_FLASH_1_SIZE >> 20);
	#endif
	#ifdef CPU_FLASH_2_BASE
		mpuPrvEarlyMapSections(topLevel, CPU_FLASH_2_PA, CPU_FLASH_2_BASE, 0xc00, CPU_FLASH_2_SIZE >> 20);
	#endif
	
	//identity map requested identity regions (AP=3, Domain=0, C=0, B=0, section). mpuMidInitC() removes these again
	#ifdef CPU_IDENT_MAP_1_BASE
	
		#if (CPU_IDENT_MAP_1_BASE & 0xfffff)
			#error "ident region VA must be megabyte-aligned"
		#endif
		#if (CPU_IDENT_MAP_1_SIZE & 0xfffff)
			#error "ident region size must be a megabyte-multiple"
		#endif
		//AP=3, Domain=0, C=0, B=0, section
		mpuPrvEarlyMapSections(topLevel, CPU_IDENT_MAP_1_BASE, CPU_IDENT_MAP_1_BASE, 0xc00, CPU_IDENT_MAP_1_SIZE >> 20);
	#endif
	
	#ifdef CPU_IDENT_MAP_2_BASE
	
		#if (CPU_IDENT_MAP_2_BASE & 0xfffff)
			#error "ident region VA must be megabyte-aligned"
		#endif
		#if (CPU_IDENT_MAP_2_SIZE & 0xfffff)
			#error "ident region size must be a megabyte-multiple"
		#endif
		//AP=3, Domain=0, C=0, B=0, section
		mpuPrvEarlyMapSections(topLevel, CPU_IDENT_MAP_2_BASE, CPU_IDENT_MAP_2_BASE, 0xc00, CPU_IDENT_MAP_2_SIZE >> 20);
	#endif
	
	//vram: Domain 0, AP=3, C=1, B=0 (XXX: any cpu that does not support write-through cache will not like this)
	#if (CPU_HARDWIRED_VRAM_ADDR >> 20) != ((CPU_HARDWIRED_VRAM_ADDR + CPU_HARDWIRED_VRAM_SIZE - 1) >> 20)
		#error "vram cannot cross megabyte boundary"
	#endif
	#if (CPU_HARDWIRED_VRAM_ADDR & 0xffff)
		#error "vram VA must start on a 64K boundary"
	#endif
	#if (CPU_HARDWIRED_VRAM_PA & 0xffff)
		#error "vram PA must start on a 64K boundary"
	#endif
	#if (CPU_HARDWIRED_VRAM_SIZE & 0xffff)
		#error "vram size must be a 64K-multiple"
	#endif
	
	//alloc a second level table, clear it, and point the top level entry (coarse, Dom = 0) at it
	secondLevel = nextSecondLevel;
	for (i = 0; i < 256; i++)
		*nextSecondLevel++ = 0;
	topLevel[CPU_HARDWIRED_VRAM_ADDR >> 20] = ((uintptr_t)secondLevel) | 1;
	
	//map in vram
	mpuEarlyMap64Kpieces(secondLevel, (CPU_HARDWIRED_VRAM_ADDR >> 12) & 0xff, CPU_HARDWIRED_VRAM_PA, 0xff8, (CPU_HARDWIRED_VRAM_SIZE + 0xffff) >> 16);
	
	//pagetables/high vecs and kernel globals share one second level table, so they must live in the same megabyte
	#if (CPU_PAGETABLE_BASE >> 20) != (CPU_KERNEL_GLOBALS_BASE >> 20)
		#error "pagetables/high vecs and kernel globals expected in the same section"
	#endif
	#if (CPU_PAGETABLE_BASE >> 20) != ((CPU_PAGETABLE_BASE + CPU_PAGETABLE_SIZE - 1) >> 20)
		#error "pagetable mapping cannot cross megabyte boundary"
	#endif
	#if (CPU_KERNEL_GLOBALS_BASE >> 20) != ((CPU_KERNEL_GLOBALS_BASE + CPU_KERNEL_GLOBALS_SIZE - 1) >> 20)
		#error "kernel globals mapping cannot cross megabyte boundary"
	#endif
	
	//alloc a second level table, clear it, and point the top level entry (coarse, Dom = 0) at it
	secondLevel = nextSecondLevel;
	for (i = 0; i < 256; i++)
		*nextSecondLevel++ = 0;
	topLevel[CPU_PAGETABLE_BASE >> 20] = ((uintptr_t)secondLevel) | 1;
	
	//map in pagetable & vector pages (AP=2, C=0, B=0, Dom = 0)
	mpuEarlyMap64Kpieces(secondLevel, (CPU_PAGETABLE_BASE >> 12) & 0xff, CPU_PAGETABLE_PA, 0xaa0, CPU_PAGETABLE_SIZE >> 16);
	
	//map in kernel globals (AP=3, C=1, B=1)
	mpuEarlyMap64Kpieces(secondLevel, (CPU_KERNEL_GLOBALS_BASE >> 12) & 0xff, CPU_KERNEL_GLOBALS_PA, 0xffc, CPU_KERNEL_GLOBALS_SIZE >> 16);
	
	//if we have sleep state, alloc a second level table and map 4K pages for that
	#ifdef CPU_SLEEP_STATE_BASE
		
		secondLevel = nextSecondLevel;
		for (i = 0; i < 256; i++)
			*nextSecondLevel++ = 0;
		topLevel[CPU_SLEEP_STATE_BASE >> 20] = ((uintptr_t)secondLevel) | 1;
		
		//map in the sleep state pages (AP=2, C=0, B=0, Dom = 0)
		mpuEarlyMap4Kpieces(secondLevel, (CPU_SLEEP_STATE_BASE >> 12) & 0xff, CPU_SLEEP_STATE_PA, 0xaa0, CPU_SLEEP_STATE_SIZE >> 12);
	
	#endif
	
	params->nextSecondLevel = nextSecondLevel;
	//xlate all addrs
	for (i = 0; i < sizeof(params->nextCodeAddr) / sizeof(*params->nextCodeAddr); i++)
		params->nextCodeAddr[i] = mpuPrvEarlyP2vCode(params->nextCodeAddr[i]);
	params->ttbr = CPU_PAGETABLE_PA + 0xC000;
	params->domainControlReg = (1 << (DOMAIN_NUM_STORAGE_RAM * 2)) + (1 << (DOMAIN_NUM_ROM * 2)) + 1;	//check perms on rom (RO), storage ram (RO) and all else (as set)
	params->cpuControlReg =	(1 << 0) |		//MMU on
							(1 << 1) |		//alignment checking on
							(0 << 2) | 		//data cache off
							(1 << 3) |		//write buffer on
							(1 << 4) | 		//exception handlers execute in 32-bit mode
							(1 << 5) | 		//no 26-bit checking please
							(1 << 6) | 		//late abort model
							(0 << 7) |		//little endian
							(0 << 8) | 		//S bit is 0
							(1 << 9) |		//R bit is 1
							(0 << 10) |		//F bit unused and zero
							(1 << 11) |		//branch prediction on
							(1 << 12) |		//instruction cache on
							(1 << 13) |		//high vectors used
							(0 << 14) |		//use the faster-on-average cache-replacement strategy
							(0 << 15);		//no legacy LDR behaviour needed
	
	return params;
}

//invalidates all TLBs, then runs the same mrc / mov / sub pc "cpwait" sequence that mpuEarlyInit() uses after
//its control register write, so the invalidate is complete before we return.
//built as ARM code and kept out of line (noinline, target("arm"))
void __attribute__((noinline,  target("arm"))) mmuTlbInval(void)
{
	uint32_t dummy;	//scratch register for the cpwait read
	
	asm volatile(
		"mcr   p15, 0, %1, c8, c7, 0		\n\t"	//inval all TLBs
		"mrc   p15, 0, %0, c2, c0, 0		\n\t"	//cpwait: read back a cp15 register...
		"mov   %0, %0						\n\t"	//...wait for that read...
		"sub   pc, pc, #4					\n\t"	//...and branch to the next instruction
		:"=r"(dummy)
		:"r"(0)
		:"memory"
	);
}

//drains the write buffer, then runs the same mrc / mov / sub pc "cpwait" sequence used elsewhere in this file.
//built as ARM code and kept out of line (noinline, target("arm"))
void __attribute__((noinline,  target("arm"))) mmuDrainWriteBuffer(void)
{
	uint32_t dummy;	//scratch register for the cpwait read
	
	asm volatile(
		"mcr   p15, 0, %1, c7, c10, 4		\n\t"	//drain write buffer
		"mrc   p15, 0, %0, c2, c0, 0		\n\t"	//cpwait
		"mov   %0, %0						\n\t"
		"sub   pc, pc, #4					\n\t"
		"stmfd sp!, {r0}					\n\t"	//erratum 3
		"ldmfd sp!, {r0}					\n\t"	//(push + pop of r0 leaves r0 and sp unchanged; presumably a workaround for a cpu erratum - TODO cite the errata sheet)
		:"=r"(dummy)
		:"r"(0)
		:"memory"
	);
}

//makes the first 16K of the pagetable/vectors 64K page (the part that holds the exception vectors) no longer
//writeable by dropping its AP to 0. the other three 16K subpages keep AP=2. runs with the mmu on, so the
//tables are reached through their virtual addresses. calls fatal() if the tables do not look as expected
void mmuLockExcPage(void)
{
	uint32_t *topLevel = (uint32_t*)(CPU_PAGETABLE_BASE + 0xc000), *secondLevel;
	uint32_t descr = topLevel[CPU_PAGETABLE_BASE >> 20];
	uint_fast8_t i;
	
	mmuDrainWriteBuffer();
	
	//top level entry will be a coarse descriptor
	if ((descr & 3) != 1)
		fatal("WTF: descr changed 1");
	
	//the descriptor holds the PA of the second level table. convert to VA, then index to our 64K page's entries
	secondLevel = (uint32_t*)(((descr & ~0x3ff) - CPU_PAGETABLE_PA + CPU_PAGETABLE_BASE));
	secondLevel += (CPU_PAGETABLE_BASE >> 12) & 0xff;
	
	//second level entry will be a 64K (large page) descriptor. check the entry itself, not the top level one again
	if ((secondLevel[0] & 3) != 1)
		fatal("WTF: descr changed 2");
	
	//a large page descriptor is repeated in 16 consecutive entries - update every copy
	for (i = 0; i < 16; i++)
		secondLevel[i] = (secondLevel[i] &~ 0xff0) | 0xa80;	//AP goes to 0 for first 16K, stays at 2 for other, cacheability and bufferability stay off due to DMA memory & pagetables being here
	
	mmuTlbInval();
}


//second entry: runs once the mmu is on. removes the identity maps, remembers where the next free second-level
//table is, and installs placeholder exception vectors. returns retTo unchanged (the asm caller jumps there)
void* __attribute__((used)) mpuMidInitC(uint32_t nextSecondLevel, void* retTo)
{
	uint32_t *l1Table = (uint32_t*)(CPU_PAGETABLE_BASE + 0xc000);
	uint32_t *vecInstrs = (uint32_t*)CPU_PAGETABLE_BASE;	//8 vector slots
	uint32_t *vecTargets = vecInstrs + 8;					//8 words the vector slots load pc from
	uint32_t *selfLoop = vecTargets + 8;					//one instruction right after them
	uint32_t idx;
	
	#ifdef CPU_IDENT_MAP_1_BASE
	
		for (idx = 0; idx < (CPU_IDENT_MAP_1_SIZE >> 20); idx++)
			l1Table[(CPU_IDENT_MAP_1_BASE >> 20) + idx] = 0;
	
	#endif
	
	#ifdef CPU_IDENT_MAP_2_BASE
	
		for (idx = 0; idx < (CPU_IDENT_MAP_2_SIZE >> 20); idx++)
			l1Table[(CPU_IDENT_MAP_2_BASE >> 20) + idx] = 0;
	
	#endif
	
	mmuTlbInval();
	
	//nothing is mapped at 0x00100000 now, so that top-level entry is reused to store the location of the next
	//available PA for secondary pagetables. shifted left by two so it cannot be taken for a valid mapping
	l1Table[1] = (uintptr_t)nextSecondLevel << 2;
	
	//every vector slot: ldr pc, [pc, #0x18], which loads pc from the matching word 8 slots further on
	for (idx = 0; idx < 8; idx++)
		vecInstrs[idx] = 0xe59ff018;
	
	//every target word points at the self loop for now
	for (idx = 0; idx < 8; idx++)
		vecTargets[idx] = (uintptr_t)selfLoop;
	
	*selfLoop = 0xeafffffe;					//the self loop (branch to itself)

	return retTo;
}

//instruction-cache clear / data-cache clean for the given address and size.
//no mmu-level work is needed for this, so the request goes straight to the platform implementation
void mpuInstrCacheClearDataCacheClean(uintptr_t addr, int32_t sz)
{
	platInstrCacheClearDataCacheClean(addr, sz);
}

//allows or forbids writes to storage ram by switching the access mode of its mmu domain.
//	3 = manager: permission bits are not checked, so writes go through
//	1 = client:  permission bits are checked, and storage ram is mapped with AP=0
void mpuSetStorageRamWriteable(bool allowWrites)
{
	mpuSetDomainAccess(DOMAIN_NUM_STORAGE_RAM, allowWrites ? 3 : 1);
}

#ifdef BUILDING_FOR_BIG_ARM
	#pragma GCC push_options
	#pragma GCC target ("arm")
#endif

//sets the 2-bit access field of one mmu domain in the domain access control register (cp15 c3).
//	domain: domain number; its field sits at bits [2 * domain + 1 : 2 * domain]
//	bits:   new field value (only the low 2 bits are used)
//the read-modify-write is done with IRQs masked (cpsr bit 0x80 set), and the original cpsr is restored afterwards
void __attribute__((noinline)) mpuSetDomainAccess(uint32_t domain, uint32_t bits)	//noinline will stop this from being inlined into thumb and then failing to link
{
	uint32_t and = ~(3u << (2 * domain));	//unsigned so the shift is well defined for the topmost domain too
	uint32_t orr = ((bits & 3) << (2 * domain));
	uint32_t dummy, dummy2, dummy3;
	
	asm volatile(
		"	mrs   %1, cpsr					\n\t"	//save cpsr
		"	orr   %2, %1, #0x80				\n\t"
		"	msr   cpsr, %2					\n\t"	//IRQs off
		"	mrc   p15, 0, %0, c3, c0, 0		\n\t"	//read domain access control reg
		"	and   %0, %3					\n\t"	//clear this domain's field
		"	orr   %0, %4					\n\t"	//insert the new value
		"	mcr   p15, 0, %0, c3, c0, 0		\n\t"	//write it back
		"	msr   cpsr, %1					\n\t"	//restore cpsr
		:"=&r"(dummy), "=&r"(dummy2), "=&r"(dummy3)
		:"r"(and), "r"(orr)
		:"memory"
	);
}

//maps one 1MB section of io space: va -> pa, Dom 0, AP=3, C=0, B=0. only usable once the MMU is up.
//noinline keeps this from being inlined into thumb code, which would then fail to link
void __attribute__((noinline)) mmuMapIoSeg(void* va, uintptr_t pa)
{
	uint32_t *l1Table = (uint32_t*)(CPU_PAGETABLE_BASE + 0xc000);
	uintptr_t sectionIdx = ((uintptr_t)va) >> 20;
	
	//0xc02 = section descriptor with AP=3
	l1Table[sectionIdx] = pa | 0xc02;
	
	mmuTlbInval();
}

//creates or removes a mapping of the first megabyte of VA (top-level entry 0) onto PA 0.
//	create == true:  entry becomes 0xc02 (section at PA 0, AP=3, Dom 0, C=0, B=0)
//	create == false: entry becomes 0 (no mapping)
void mmuPrvSleepMapping(bool create)
{
	uint32_t *l1Table = (uint32_t*)(CPU_PAGETABLE_BASE + 0xc000);
	
	l1Table[0] = create ? 0xc02 : 0x000;
	
	mmuTlbInval();
}

/*
	first-stage entry point. naked: the body is pure asm, the compiler adds no prologue or epilogue.
	what the asm does, in order:
	 1. turns FCSE off
	 2. switches to FIQ mode to call machPreRamInit, then back to SVC mode. the original comment explains why:
	    there is nowhere to save our return address, and the non-SVC modes are not in use yet
	 3. points sp at [tempStack], which leaves room for one struct EarlyInitParams at the very top of the
	    kernel globals (PA). r0 = that struct, and the old sp is pushed just below it
	 4. stores lr and the address of in_mmu into nextCodeAddr[0..1], then calls mpuEarlyInitC at the address it
	    is currently running from (computed relative to label 1)
	 5. restores sp, reloads lr and the rest of the struct, loads ttbr & the domain control reg, drains the write
	    buffer, invalidates caches / BTB / TLBs, writes the control reg (mmu on) and does a cpwait
	 6. jumps to the translated address of in_mmu, calls mpuMidInitC(nextSecondLevel, translated return address),
	    invalidates the TLBs once more and branches to the address mpuMidInitC returned
*/
void __attribute__((naked, used)) mpuEarlyInit(void)		//entered with mmu off, no stack, must exit with mmu on, return to where we were. cannot corrupt anything other than ABI-allowables
{
	asm volatile(
	
		"1:													\n\t"
		
		//FCSE off
		"	mov   r1, #0									\n\t"
		"	mcr   p15, 0, r1, c13, c0, 0					\n\t"
		
	//we need to call per-device init, but we have nowhere to save the return address, so we get creative.
	//we know that other modes (non-svc) are not yet used
	
		"	msr  cpsr_c, #0xD1								\n\t"	//FIQ mode
		"	nop												\n\t"
		"	bl   machPreRamInit								\n\t"
		"	nop												\n\t"
		"	msr  cpsr_c, #0xD3								\n\t"	//back to SVC mode
		"	nop												\n\t"
		
		"	mov   r1, sp									\n\t"
		"	ldr   sp, =%[tempStack]							\n\t"
		"	mov   r0, sp									\n\t"
		"	stmfd sp!, {r1}									\n\t"
		"	adr   r1, in_mmu								\n\t"
		"	str   lr, [r0, #0]								\n\t"	//store nextCodeAddr[0]
		"	str   r1, [r0, #4]								\n\t"	//store nextCodeAddr[1]
		"	ldr   r12, =mpuEarlyInitC						\n\t"	//returns value we passed in
		"	ldr   r3, =1b									\n\t"	//call it in a position-independent way since
		"	sub   r12, r3									\n\t"	//the thunk gcc generates is position-dependent
		"	adr   r3, 1b									\n\t"
		"	add   r12, r3									\n\t"
		"	blx   r12										\n\t"
		"	ldmfd sp, {sp}									\n\t"
		"	ldr   lr, [r0], #4								\n\t"	//nextCodeAddr[0]
		"	ldmia r0, {r0, r1, r2, r3, r12}					\n\t"	//nextCodeAddr[1], .ttbr, .domainControlReg, .cpuControlReg, .nextSecondLevel
		"	mcr   p15, 0, r1, c2, c0, 0						\n\t"	//set ttbr
		"	mcr   p15, 0, r2, c3, c0, 0						\n\t"	//set domain control reg
		"	mov   r2, #0									\n\t"
		"	mcr   p15, 0, r2, c7, c10, 4					\n\t"	//drain write buffer (watch out for erratum 3)
		"	mcr   p15, 0, r2, c7, c5, 0						\n\t"	//inval all of icache & BTB
		"	mcr   p15, 0, r2, c7, c6, 0						\n\t"	//inval all of dcache & mini dcache
		"	mcr   p15, 0, r2, c7, c5, 6						\n\t"	//clear all of BTB
		"	mcr   p15, 0, r2, c8, c7, 0						\n\t"	//inval all TLBs
		"	mcr   p15, 0, r3, c1, c0, 0						\n\t"	//control reg write (and mmu on)
		"	mrc   p15, 0, r2, c2, c0, 0						\n\t"	//cpwait
		"	mov   r2, r2									\n\t"
		"	sub   pc, pc, #4								\n\t"	//after this mmu is definitely on
		"	bx    r0										\n\t"	//jump to mmu map of this code
		"in_mmu:											\n\t"
		"	mov   r0, r12									\n\t"	//second stage (remove identity maps)
		"	mov   r1, lr									\n\t"	//addr to translate for return
		"	bl    mpuMidInitC								\n\t"
		"	mov   r1, #0									\n\t"
		"	mcr   p15, 0, r1, c8, c7, 0						\n\t"	//inval all TLBs
		"	bx    r0										\n\t"
		:
		:[tempStack] "i"(CPU_KERNEL_GLOBALS_SIZE + CPU_KERNEL_GLOBALS_PA - sizeof(struct EarlyInitParams))
		:"memory", "cc"
	);
}


#ifdef BUILDING_FOR_BIG_ARM
	#pragma GCC pop_options
#endif








