#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "kernel_int.h"
#include "atomic.h"
#include "kernel.h"
#include "printf.h"
#include "irqs.h"
#include "emu.h"


//#define DEBUG_PACE_EMULATION		//allow logging of each opcode pace executes, assuming this code is enabled, struct EmuContex's stack is increased, and ARM pace is used


//indices into the emulated register file for the registers that get special treatment
#define REG_NO_SP					13
#define REG_NO_LR					14
#define REG_NO_PC					15

//status register flag bits as stored in the emulated status word (bits 31..27)
#define ARM_SR_BIT_N				0x80000000UL
#define ARM_SR_BIT_Z				0x40000000UL
#define ARM_SR_BIT_C				0x20000000UL
#define ARM_SR_BIT_V				0x10000000UL
#define ARM_SR_BIT_Q				0x08000000UL
//the only status bits that emulated code may read or write (see MRS/MSR handling)
#define ARM_SR_BITS_APP				(ARM_SR_BIT_N | ARM_SR_BIT_Z | ARM_SR_BIT_C | ARM_SR_BIT_V | ARM_SR_BIT_Q)
//OR-ed into the value MRS returns, so emulated code sees these low bits set
#define ARM_V5_SR_USER_MODE_BITS	0x10

//one-time init hook for this emulator core; currently there is nothing to set up
void emuCoreInit(void)
{
	return;
}

//Report an instruction the emulator cannot execute: log the opcode and its address,
// raise a native undefined-instruction fault, and never return.
// "state" and "errorMsg" are accepted but not used yet.
static void emuCpuUndefInstr(struct EmuCpuState* state, uint32_t instrPc, uint32_t instr, const char* errorMsg)
{
	//TODO
	loge("EMULATED INVALID INSTR 0x%08x @ 0x%08x\n", instr, instrPc);
	asm volatile("udf #0");
	
	//should the fault handler ever return, park here
	for (;;)
		;
}

//Read an emulated register. A read of r15 yields the stored PC plus 4; all other registers are returned as stored.
static inline uint32_t emuCpuRegGet(struct EmuCpuState* cpu, uint32_t reg)
{
	uint32_t stored = cpu->regs[reg];
	
	return (reg == REG_NO_PC) ? stored + 4 : stored;
}

//Write an emulated register. No special casing: a write to r15 simply replaces the stored PC.
static inline void emuCpuRegSet(struct EmuCpuState* cpu, uint32_t reg, uint32_t val)
{
	*(cpu->regs + reg) = val;
}

//Decode the immediate form of a data-processing operand (8-bit value rotated right by an even amount).
//	*valP      <- the rotated immediate (always written)
//	*sP        <- the instruction's S bit (if sP is not NULL)
//	*carryOutP <- shifter carry-out, written only when S is set and carryOutP is not NULL:
//	              bit 31 of the result if a rotation happened, else the current C flag
static inline void emuCpuAddrMode1imm(struct EmuCpuState* cpu, uint32_t instr, uint32_t* valP, bool* sP, bool *carryOutP)
{
	const uint32_t rotation = (instr >> 7) & 0x1E;			//rotate field, already doubled
	const bool setsFlags = (instr & 0x00100000) != 0;
	uint32_t operand = instr & 0xFF;
	
	if (rotation)
		operand = (operand >> rotation) | (operand << (32 - rotation));
	
	if (sP)
		*sP = setsFlags;
	
	if (setsFlags && carryOutP) {
		if (rotation)
			*carryOutP = (operand >> 31) != 0;
		else
			*carryOutP = (cpu->sr & ARM_SR_BIT_C) != 0;
	}
	
	*valP = operand;
}

//Decode the register form of a data-processing operand (Rm, optionally shifted by an immediate or by Rs).
//	*valP      <- the shifted value (written only on success)
//	*sP        <- the instruction's S bit (if sP is not NULL)
//	*carryOutP <- shifter carry-out, written only when S is set and carryOutP is not NULL
//Returns false when bits 7..4 do not describe a shifter operand (bit 7 and bit 4 both set), true otherwise.
//The switch key is instr bits 7..4: bit 4 selects shift-by-register, bits 6..5 the shift type and, for
// immediate shifts, bit 7 is the low bit of the shift amount. That is why each shift type has an extra
// "0b1xx0" label: it is the immediate form with an odd (hence nonzero) amount.
static inline bool emuCpuAddrMode1reg(struct EmuCpuState* cpu, uint32_t instr, uint32_t* valP, bool* sP, bool *carryOutP)
{
	uint32_t ret, shiftImm = (instr >> 7) & 0x1F;
	bool s, carryOut = false;
	
	s = !!(instr & 0x00100000);
	if (sP)
		*sP = s;
	
	ret = emuCpuRegGet(cpu, instr & 0x0F);
	
	switch ((instr >> 4) & 0x0F) {
		case 0b0001:				//LSL reg
			shiftImm = 0xFF & emuCpuRegGet(cpu, (instr >> 8) & 0x0F);	//only the low byte of Rs counts
			if (shiftImm > 32) {
				if (s)
					carryOut = false;
				ret = 0;
				break;
			}
			else if (shiftImm == 32) {
				if (s)
					carryOut = ret & 1;
				ret = 0;
				break;
			}
			//fallthrough
		case 0b0000:				//LSL imm
			if (!shiftImm) {		//no shift at all: value unchanged, carry is the current C flag
				if (s)
					carryOut = !!(cpu->sr & ARM_SR_BIT_C);
				break;
			}
			//fallthrough
		case 0b1000:
			if (s)
				carryOut = (ret >> (32 - shiftImm)) & 1;	//last bit shifted out
			ret <<= shiftImm;
			break;
		
		case 0b0011:				//LSR reg
			shiftImm = 0xFF & emuCpuRegGet(cpu, (instr >> 8) & 0x0F);
			if (shiftImm > 32) {
				if (s)
					carryOut = false;
				ret = 0;
				break;
			}
			else if (shiftImm == 32) {
				if (s)
					carryOut = ret >> 31;
				ret = 0;
				break;
			}
			else if (!shiftImm) {
				if (s)
					carryOut = !!(cpu->sr & ARM_SR_BIT_C);
				break;
			}
			goto lsr_imm_nonzero;	//we could fallthrough, but this saves us a few cycles
		case 0b0010:				//LSR imm
			if (!shiftImm) {		//an immediate of 0 encodes LSR #32
				if (s)
					carryOut = ret >> 31;
				ret = 0;
				break;
			}
			//fallthrough
	lsr_imm_nonzero:
		case 0b1010:
			if (s)
				carryOut = (ret >> (shiftImm - 1)) & 1;
			ret >>= shiftImm;
			break;	
		
		case 0b0101:				//ASR reg
			shiftImm = 0xFF & emuCpuRegGet(cpu, (instr >> 8) & 0x0F);
			if (shiftImm >= 32)
				goto asm_imm_32;
			else if (!shiftImm) {
				if (s)
					carryOut = !!(cpu->sr & ARM_SR_BIT_C);
				break;
			}
			goto asm_imm_32_nonzero;	//we could fallthrough, but this saves us a few cycles
		case 0b0100:				//ASR imm
			if (!shiftImm) {		//an immediate of 0 encodes ASR #32
	asm_imm_32:
				if (s)
					carryOut = ret >> 31;
				ret = (((int32_t)ret) >> 16) >> 16;	//to avoid warning of shift by type width
				break;
			}
			//fallthrough
			
		case 0b1100:
	asm_imm_32_nonzero:
			if (s)
				carryOut = (ret >> (shiftImm - 1)) & 1;
			ret = ((int32_t)ret) >> shiftImm;
			break;
		
		case 0b0111:				//ROR reg
			shiftImm = 0xFF & emuCpuRegGet(cpu, (instr >> 8) & 0x0F);
			if (!shiftImm) {
				if (s)
					carryOut = !!(cpu->sr & ARM_SR_BIT_C);
				break;
			}
			shiftImm &= 0x1F;
			if (!shiftImm) {		//nonzero multiple of 32: value unchanged, carry is bit 31
				if (s)
					carryOut = ret >> 31;
				break;
			}
			goto ror_imm_32_nonzero;
		case 0b0110:				//ROR imm/RRX
			if (!shiftImm) {	//RRX
				carryOut = ret & 1;		//carry-out is the bit rotated out, i.e. Rm[0] (was wrongly taken from bit 31)
				ret >>= 1;
				if (cpu->sr & ARM_SR_BIT_C)
					ret |= 0x80000000UL;
				break;
			}
			//fallthrough
		case 0b1110:
	ror_imm_32_nonzero:
			if (s)
				carryOut = (ret >> (shiftImm - 1)) & 1;
			ret = (ret >> shiftImm) | (ret << (32 - shiftImm));
			break;
		default:
			return false;
	}
	
	if (s && carryOutP)
		*carryOutP = carryOut;
	
	*valP = ret;
	
	return true;
}

//Signed saturating 32-bit add. On overflow the result is clamped to 0x80000000 or 0x7fffffff
// (chosen by the sign of val1) and the Q bit is set in cpu->sr.
static int32_t emuCpuSatAdd(struct EmuCpuState* cpu, uint32_t val1, uint32_t val2)	//will set Q bit too
{
	const uint32_t signBit = 0x80000000;
	uint32_t sum = val1 + val2;
	bool inputsAgree = ((val1 ^ val2) & signBit) == 0;
	bool signChanged = ((sum ^ val1) & signBit) != 0;
	
	//adding can only overflow when both inputs share a sign and the sum does not
	if (inputsAgree && signChanged) {
		cpu->sr |= ARM_SR_BIT_Q;
		sum = (val1 & signBit) ? 0x80000000 : 0x7fffffff;
	}
	
	return sum;
}

//Signed saturating 32-bit subtract: returns val1 - val2. On overflow the result is clamped to
// 0x80000000 or 0x7fffffff (chosen by the sign of val1) and the Q bit is set in cpu->sr.
static uint32_t emuCpuSatSub(struct EmuCpuState* cpu, uint32_t val1, uint32_t val2)	//will set Q bit too
{
	uint32_t ret = val1 - val2;	//this used to add, which made QSUB/QDSUB produce sums
	
	if (((val1 ^ val2) & 0x80000000) && ((ret ^ val1) & 0x80000000)) {	//overflow if sources have diff sign and result differs from first reg's sign
		
		cpu->sr |= ARM_SR_BIT_Q;
		if (val1 & 0x80000000)	//what should the result have been?
			ret = 0x80000000;
		else
			ret = 0x7fffffff;
	}
	
	return ret;
}

static inline bool emuCpuDataProcessing(struct EmuCpuState* cpu, uint32_t instr, uint32_t op2, bool s, bool shifterCarryOut)
{
	uint32_t by, wholeSrOut, val = emuCpuRegGet(cpu, (instr >> 16) & 0x0F), val2;	//rn
	bool carryOut = shifterCarryOut;
	uint64_t val64;
	
	switch ((instr >> 21) & 0x0F) {
		case 0b0000: //AND
			val &= op2;
			if (s)
				goto set_shifterCarryOutC_nz;
			goto save_val;
		case 0b0001: //EOR
			val ^= op2;
			if (s)
				goto set_shifterCarryOutC_nz;
			goto save_val;
		case 0b0010: //SUB
			asm(
				".syntax unified	\n\t"	//gcc forces divided syntax for m0. force it back to unified
				"subs %0, %2		\n\t"
				"mrs %1, APSR		\n\t"
				:"+l"(val), "=l"(wholeSrOut)
				:"l"(op2), "0"(val)
				:"cc"
			);
			if (s)
				cpu->sr = wholeSrOut;
			goto save_val;
		case 0b0011: //RSB
			asm(
				".syntax unified	\n\t"	//gcc forces divided syntax for m0. force it back to unified
				"subs %0, %2		\n\t"
				"mrs %1, APSR		\n\t"
				:"=&l"(val), "=l"(wholeSrOut)
				:"l"(val), "0"(op2)
				:"cc"
			);
			if (s)
				cpu->sr = wholeSrOut;
			goto save_val;
		case 0b0100: //ADD
			asm(
				".syntax unified	\n\t"	//gcc forces divided syntax for m0. force it back to unified
				"adds %0, %2		\n\t"
				"mrs %1, APSR		\n\t"
				:"+l"(val), "=l"(wholeSrOut)
				:"l"(op2), "0"(val)
				:"cc"
			);
			if (s)
				cpu->sr = wholeSrOut;
			goto save_val;
		case 0b0101: //ADC
			asm(
				".syntax unified	\n\t"	//gcc forces divided syntax for m0. force it back to unified
				"msr APSR_nzcvq, %3	\n\t"
				"adcs %0, %2		\n\t"
				"mrs %1, APSR		\n\t"
				:"+l"(val), "=l"(wholeSrOut)
				:"l"(op2), "l"(cpu->sr), "0"(val)
				:"cc"
			);
			if (s)
				cpu->sr = wholeSrOut;
			goto save_val;
		case 0b0110: //SBC
			asm(
				".syntax unified	\n\t"	//gcc forces divided syntax for m0. force it back to unified
				"msr APSR_nzcvq, %3	\n\t"
				"sbcs %0, %2		\n\t"
				"mrs %1, APSR		\n\t"
				:"+l"(val), "=l"(wholeSrOut)
				:"l"(op2), "l"(cpu->sr), "0"(val)
				:"cc"
			);
			if (s)
				cpu->sr = wholeSrOut;
			goto save_val;
		case 0b0111: //RSC
			asm(
				".syntax unified	\n\t"	//gcc forces divided syntax for m0. force it back to unified
				"msr APSR_nzcvq, %3	\n\t"
				"sbcs %0, %2		\n\t"
				"mrs %1, APSR		\n\t"
				:"=&l"(val), "=l"(wholeSrOut)
				:"0"(op2), "l"(cpu->sr), "1"(val)
				:"cc"
			);
			if (s)
				cpu->sr = wholeSrOut;
			goto save_val;
		case 0b1000: //TST
			if (!s)
				goto misc_instrs;
			val &= op2;
			goto set_shifterCarryOutC_nz_nowrite;
		case 0b1001: //TEQ
			if (!s)
				goto misc_instrs;
			val ^= op2;
			goto set_shifterCarryOutC_nz_nowrite;
		case 0b1010: //CMP
			if (!s)
				goto misc_instrs;
			asm(
				".syntax unified	\n\t"	//gcc forces divided syntax for m0. force it back to unified
				"cmp %1, %2			\n\t"
				"mrs %0, APSR		\n\t"
				:"=r"(wholeSrOut)
				:"l"(val), "l"(op2)
				:"cc"
			);
			cpu->sr = wholeSrOut;
			goto no_save_val;
		case 0b1011: //CMN
			if (!s)
				goto misc_instrs;
			asm(
				".syntax unified	\n\t"	//gcc forces divided syntax for m0. force it back to unified
				"cmn %1, %2			\n\t"
				"mrs %0, APSR		\n\t"
				:"=r"(wholeSrOut)
				:"l"(val), "l"(op2)
				:"cc"
			);
			cpu->sr = wholeSrOut;
			goto no_save_val;
		case 0b1100: //ORR
			val |= op2;
			if (s)
				goto set_shifterCarryOutC_nz;
			goto save_val;
		case 0b1101: //MOV
			val = op2;
			if (s)
				goto set_shifterCarryOutC_nz;
			goto save_val;
		case 0b1110: //BIC
			val &=~ op2;
			if (s)
				goto set_shifterCarryOutC_nz;
			goto save_val;
		case 0b1111: //MVN
			val = ~op2;
			if (s)
				goto set_shifterCarryOutC_nz;
			goto save_val;
	}
	
set_shifterCarryOutC_nz:
	wholeSrOut = cpu->sr | ARM_SR_BIT_C | ARM_SR_BIT_N | ARM_SR_BIT_Z;		//set all and clear some is faster as "BFC" gets used (or at least should)

	if (!carryOut)
		wholeSrOut &=~ ARM_SR_BIT_C;
	if (!(val >> 31))
		wholeSrOut &=~ ARM_SR_BIT_N;
	if (val)
		wholeSrOut &=~ ARM_SR_BIT_Z;

	cpu->sr = wholeSrOut;

save_val:
	emuCpuRegSet(cpu, (instr >> 12) & 0x0F, val);
	return true;

set_shifterCarryOutC_nz_nowrite:
	wholeSrOut = cpu->sr | ARM_SR_BIT_C | ARM_SR_BIT_N | ARM_SR_BIT_Z;		//set all and clear some is faster as "BFC" gets used (or at least should)

	if (!carryOut)
		wholeSrOut &=~ ARM_SR_BIT_C;
	if (!(val >> 31))
		wholeSrOut &=~ ARM_SR_BIT_N;
	if (val)
		wholeSrOut &=~ ARM_SR_BIT_Z;

	cpu->sr = wholeSrOut;

no_save_val:
	return true;

misc_instrs:	//table 3.3
	//quick check for bx/blx since "BX LR" is a hot path
	if ((instr & 0x0FFFFFD0) == 0x012FFF10) {
		val = emuCpuRegGet(cpu, instr & 0x0F);
		if (instr & 0x00000020)
			emuCpuRegSet(cpu, REG_NO_LR, cpu->regs[REG_NO_PC]);	//already points to next instr
		emuCpuRegSet(cpu, REG_NO_PC, val);
		return true;
	}
	if ((instr & 0x02200000) == 0x02200000) {	//MSR.imm
		
		if ((instr >> 22) & 1)	//SPSR not supported in user mode
			return false;
		
		val = (instr & 0xFF);
		by = (instr >> 7) & 0x1E;
		if (by)
			val = (val >> by) | (val << (32 - by));
				
		if (instr & 0x00080000)
			cpu->sr = (cpu->sr &~ ARM_SR_BITS_APP) | (val & ARM_SR_BITS_APP);
	}
	else switch ((instr >> 4) & 0x0F) {	
		case 0b0000: switch ((instr >> 21) & 0x1F) {
			case 0b01000:
			case 0b01010:	//MRS
				if ((instr >> 22) & 1)	//SPSR not supported in user mode
					return false;
				emuCpuRegSet(cpu, (instr >> 12) & 0x0F, (cpu->sr & ARM_SR_BITS_APP) | ARM_V5_SR_USER_MODE_BITS);
				break;
			case 0b01001:
			case 0b01011:	//MSR.reg
				if ((instr >> 22) & 1)	//SPSR not supported in user mode
					return false;
				if (instr & 0x00080000)
					cpu->sr = (cpu->sr &~ ARM_SR_BITS_APP) | (val & ARM_SR_BITS_APP);
				break;
			default:
				return false;
		}
		break;
		
		case 0b0001: switch ((instr >> 21) & 0x1F) {
			case 0b01011:	//CLZ
				val = emuCpuRegGet(cpu, instr & 0x0F);
				val = val ? __builtin_clz(val) : 32;
				emuCpuRegSet(cpu, (instr >> 12) & 0x0F, val);
				break;
			default:
				return false;
		}
		break;
		
		case 0b0101: switch ((instr >> 21) & 0x1F) {
			case 0b01000:	//QADD
				emuCpuRegSet(cpu, (instr >> 12) & 0x0F, emuCpuSatAdd(cpu, emuCpuRegGet(cpu, (instr >> 0) & 0x0F), emuCpuRegGet(cpu, (instr >> 16) & 0x0F)));
				break;
			case 0b01001:	//QSUB
				emuCpuRegSet(cpu, (instr >> 12) & 0x0F, emuCpuSatSub(cpu, emuCpuRegGet(cpu, (instr >> 0) & 0x0F), emuCpuRegGet(cpu, (instr >> 16) & 0x0F)));
				break;
			case 0b01010:	//QDADD
				val = emuCpuRegGet(cpu, (instr >> 16) & 0x0F);
				val = emuCpuSatAdd(cpu, val, val);
				emuCpuRegSet(cpu, (instr >> 12) & 0x0F, emuCpuSatAdd(cpu, emuCpuRegGet(cpu, (instr >> 0) & 0x0F), val));
				break;
			case 0b01011:	//QDSUB
				val = emuCpuRegGet(cpu, (instr >> 16) & 0x0F);
				val = emuCpuSatAdd(cpu, val, val);
				emuCpuRegSet(cpu, (instr >> 12) & 0x0F, emuCpuSatSub(cpu, emuCpuRegGet(cpu, (instr >> 0) & 0x0F), val));
				break;
			default:
				return false;
		}
		break;
		
		case 0b0111: switch ((instr >> 21) & 0x1F) {
			case 0b01001:	//BKPT
				loge("BKPT in ARM code\n");
				asm("udf #0");
				__builtin_unreachable();
			default:
				return false;
		}
		break;
		
		case 0b1000:	//saturating multiplies
		case 0b1010:
		case 0b1100:
		case 0b1110: switch ((instr >> 21) & 0x1F) {
			case 0b01000:	//SMLAxy
				val = emuCpuRegGet(cpu, (instr >> 0) & 0x0F);	//Rm
				val2 = emuCpuRegGet(cpu, (instr >> 8) & 0x0F);	//Rs
				if (instr & 0x20)
					val >>= 16;
				if (instr & 0x40)
					val2 >>= 16;
				
				val = (int32_t)(int16_t)val * (int32_t)(int16_t)val2;
				emuCpuRegSet(cpu, (instr >> 16) & 0x0F, emuCpuSatAdd(cpu, emuCpuRegGet(cpu, (instr >> 16) & 0x0F), val));
				break;
				
			case 0b01001:	//SMULWy/SMLAWy
				val = emuCpuRegGet(cpu, (instr >> 0) & 0x0F);	//Rm
				val2 = emuCpuRegGet(cpu, (instr >> 8) & 0x0F);	//Rs
				if (instr & 0x40)
					val2 >>= 16;
				val = ((int64_t)(int16_t)val2 * (int64_t)(int32_t)val) >> 16;
				if (!(instr & 0x20))	//SMLAWy
					val = emuCpuSatAdd(cpu, val, emuCpuRegGet(cpu, (instr >> 12) & 0x0F));
				emuCpuRegSet(cpu, (instr >> 16) & 0x0F, val);
				break;
			case 0b01010:	//SMLALxy
				val = emuCpuRegGet(cpu, (instr >> 0) & 0x0F);	//Rm
				val2 = emuCpuRegGet(cpu, (instr >> 8) & 0x0F);	//Rs
				if (instr & 0x20)
					val >>= 16;
				if (instr & 0x40)
					val2 >>= 16;
				
				val64 = emuCpuRegGet(cpu, (instr >> 16) & 0x0F);
				val64 <<= 32;
				val64 += emuCpuRegGet(cpu, (instr >> 12) & 0x0F);
				val64 += (int64_t)(int16_t)val * (int64_t)(int16_t)val2;
				
				emuCpuRegSet(cpu, (instr >> 12) & 0x0F, val64);
				emuCpuRegSet(cpu, (instr >> 16) & 0x0F, val64 >> 32);
				break;
				
			case 0b01011:	//SMULxy
				val = emuCpuRegGet(cpu, (instr >> 0) & 0x0F);	//Rm
				val2 = emuCpuRegGet(cpu, (instr >> 8) & 0x0F);	//Rs
				if (instr & 0x20)
					val >>= 16;
				if (instr & 0x40)
					val2 >>= 16;
				
				val = (int32_t)(int16_t)val * (int32_t)(int16_t)val2;
				emuCpuRegSet(cpu, (instr >> 16) & 0x0F, val);
				break;
			
			default:
				return false;
		}
		break;
		
		default:
			return false;
	}
	return true;
	
	//SWI, eDSP instrs unsupported
	return false;
}

//Execute one instruction from the "multiplies and extra loads/stores" encoding space.
// Nonzero bits 6..5 -> addressing mode 3 transfers (LDRH/STRH/LDRSB/LDRSH/LDRD/STRD),
// zero bits 6..5    -> MUL/MLA, the 64-bit multiplies, and SWP/SWPB.
//Memory is accessed directly through the emulated addresses.
//Returns false for encodings that are not supported, true once the instruction has been executed.
static inline bool emuCpuMultipliesAndExtraLoadsAndStores(struct EmuCpuState* cpu, uint32_t instr)
{
	if ((instr >> 5) & 3) {	//mode 3 ops
		uint32_t ea, idx, wbVal = 0;
		bool wb = false;
		
		ea = emuCpuRegGet(cpu, (instr >> 16) & 0x0F);
		//bit 22 set: 8-bit immediate split over bits 11..8 and 3..0; clear: offset is Rm
		idx = (instr & 0x00400000UL) ? ((instr & 0x0F) | ((instr >> 4) & 0xF0)) : emuCpuRegGet(cpu, instr & 0x0F);
		
		if (!(instr & 0x00800000UL))	//U clear: subtract the offset
			idx = -idx;
		
		if (!(instr & 0x01000000UL)) {	//post indexed
			wbVal = ea + idx;
			wb = true;
		}
		else {
			ea += idx;
			if (instr & 0x00200000UL) {	//pre indexed with writeback
				wbVal = ea;
				wb = true;
			}
		}
		
		if (instr & 0x00100000UL) {	//load
			switch ((instr >> 5) & 3) {
				case 0b00:	//unsigned byte - not encoded this way
					return false;
				case 0b01:	//LDRH
					emuCpuRegSet(cpu, (instr >> 12) & 0x0F, *(uint16_t*)ea);
					break;
				case 0b10:	//LDRSB
					emuCpuRegSet(cpu, (instr >> 12) & 0x0F, (int32_t)*(int8_t*)ea);
					break;
				case 0b11:	//LDRSH:
					emuCpuRegSet(cpu, (instr >> 12) & 0x0F, (int32_t)*(int16_t*)ea);
					break;
			}		
		}
		else {						//store (or LDRD)
			switch ((instr >> 5) & 3) {
				case 0b00:	//unsigned byte - not encoded this way
					return false;
				case 0b01:	//STRH
					*(uint16_t*)ea = emuCpuRegGet(cpu, (instr >> 12) & 0x0F);
					break;
				case 0b10:	//LDRD
					if ((instr >> 12) & 1)	//Rd must be even
						return false;
					emuCpuRegSet(cpu, ((instr >> 12) & 0x0F) + 0, *(uint32_t*)(ea + 0));
					emuCpuRegSet(cpu, ((instr >> 12) & 0x0F) + 1, *(uint32_t*)(ea + 4));
					break;
				case 0b11:	//STRD
					if ((instr >> 12) & 1)	//Rd must be even
						return false;
					*(uint32_t*)(ea + 0) = emuCpuRegGet(cpu, ((instr >> 12) & 0x0F) + 0);
					*(uint32_t*)(ea + 4) = emuCpuRegGet(cpu, ((instr >> 12) & 0x0F) + 1);
					break;
			}
		}
		
		//wbak?
		if (wb)
			emuCpuRegSet(cpu, (instr >> 16) & 0x0F, wbVal);
	}
	else {
		uint32_t wholeSrOut, val = 0, val32;
		uint64_t tmp64 = 0;
		bool s = false;
		uint8_t val8;
		
		//bits 24..20 select the operation; the low bit is S for the multiplies
		switch ((instr >> 20) & 0x1F) {
			case 0b00001:		//MULS
				s = true;
				goto domul;
			case 0b00011:		//MLAS
				s = true;
				//fallthrough
			case 0b00010:		//MLA
				val = emuCpuRegGet(cpu, (instr >> 12) & 0x0F);	//accumulator Rn
				//fallthrough
			case 0b00000:		//MUL
		domul:
				val += emuCpuRegGet(cpu, instr & 0x0F) * emuCpuRegGet(cpu, (instr >> 8) & 0x0F);
				if (s) {	//only N and Z are updated
					wholeSrOut = cpu->sr | ARM_SR_BIT_N | ARM_SR_BIT_Z;
					if (!(val >> 31))
						wholeSrOut &=~ ARM_SR_BIT_N;
					if (val)
						wholeSrOut &=~ ARM_SR_BIT_Z;
					cpu->sr = wholeSrOut;
				}
				emuCpuRegSet(cpu, (instr >> 16) & 0x0F, val);
				break;
			
			case 0b01011:		//UMLALS
				s = true;
				//fallthrough
			case 0b01010:		//UMLAL
				tmp64 = emuCpuRegGet(cpu, (instr >> 16) & 0x0F);	//RdHi
				tmp64 <<= 32;
				tmp64 |= emuCpuRegGet(cpu, (instr >> 12) & 0x0F);	//RdLo
				//fallthrough
			case 0b01000:		//UMULL
		lmul_u:
				tmp64 += (uint64_t)emuCpuRegGet(cpu, (instr >> 8) & 0x0F) * (uint64_t)emuCpuRegGet(cpu, instr & 0x0F);
		lmul_set_flags_and_save:
				if (s) {	//N and Z from the full 64-bit result
					wholeSrOut = cpu->sr | ARM_SR_BIT_N | ARM_SR_BIT_Z;
					if (!(tmp64 >> 63))
						wholeSrOut &=~ ARM_SR_BIT_N;
					if (tmp64)
						wholeSrOut &=~ ARM_SR_BIT_Z;
					cpu->sr = wholeSrOut;
				}
				emuCpuRegSet(cpu, (instr >> 16) & 0x0F, tmp64 >> 32);
				emuCpuRegSet(cpu, (instr >> 12) & 0x0F, tmp64);
				break;
			case 0b01001:		//UMULLS
				s = true;
				goto lmul_u;
			
			case 0b01111:		//SMLALS
				s = true;
				//fallthrough
			case 0b01110:		//SMLAL
				tmp64 = emuCpuRegGet(cpu, (instr >> 16) & 0x0F);	//RdHi
				tmp64 <<= 32;
				tmp64 |= emuCpuRegGet(cpu, (instr >> 12) & 0x0F);	//RdLo
				//fallthrough
			case 0b01100:		//SMULL
		lmul_s:
				//go through int32_t first: casting the uint32_t register value straight to int64_t
				// zero-extends, which turned this into an unsigned multiply
				tmp64 += (int64_t)(int32_t)emuCpuRegGet(cpu, (instr >> 8) & 0x0F) * (int64_t)(int32_t)emuCpuRegGet(cpu, instr & 0x0F);
				goto lmul_set_flags_and_save;
			case 0b01101:		//SMULLS
				s = true;
				goto lmul_s;
			
			case 0b10000:		//SWP
			
				//SWP Rd, Rm, [Rn]: the address is Rn (bits 19..16), the old value goes to Rd (bits 15..12).
				// these two fields used to be swapped
				atomicSwap32(&val32, (uint32_t*)emuCpuRegGet(cpu, (instr >> 16) & 0x0F), emuCpuRegGet(cpu, instr & 0x0F));
				emuCpuRegSet(cpu, (instr >> 12) & 0x0F, val32);
				break;
				
			case 0b10100:		//SWPB
				
				//same register layout as SWP
				atomicSwap8(&val8, (uint8_t*)emuCpuRegGet(cpu, (instr >> 16) & 0x0F), emuCpuRegGet(cpu, instr & 0x0F));
				emuCpuRegSet(cpu, (instr >> 12) & 0x0F, val8);
				break;
				
			default:
				return false;
		}
	}
	
	return true;
}

//Execute a word/byte load or store using addressing mode 2 (LDR/STR/LDRB/STRB).
// The offset is either a 12-bit immediate or Rm with an immediate shift; bits 24..20 (P U B W L)
// pick pre/post indexing, add/subtract, byte/word, writeback and load/store.
// The "T" forms (post-indexed with W set) are not supported.
//Returns false for unsupported encodings, true once the transfer has been done.
static inline bool emuCpuMode2(struct EmuCpuState* cpu, uint32_t instr)
{
	const uint32_t rnNo = (instr >> 16) & 0x0F, rdNo = (instr >> 12) & 0x0F;
	const bool preIndexed = (instr & 0x01000000UL) != 0;	//P
	const bool addOfst = (instr & 0x00800000UL) != 0;		//U
	const bool byteWide = (instr & 0x00400000UL) != 0;		//B
	const bool wBit = (instr & 0x00200000UL) != 0;			//W
	const bool isLoad = (instr & 0x00100000UL) != 0;		//L
	uint32_t base = emuCpuRegGet(cpu, rnNo);
	uint32_t ofst, indexed, ea;
	
	if (instr & 0x02000000UL) {			//register offset, shifted by an immediate amount
		uint32_t amount = (instr >> 7) & 0x1F;
		uint32_t shiftType = (instr >> 5) & 3;
		
		ofst = emuCpuRegGet(cpu, instr & 0x0F);
		
		if (shiftType == 0b00)			//LSL
			ofst <<= amount;
		else if (shiftType == 0b01)		//LSR, amount 0 encodes 32
			ofst = amount ? (ofst >> amount) : 0;
		else if (shiftType == 0b10)		//ASR, amount 0 encodes 32, which gives the same result as 31
			ofst = ((int32_t)ofst) >> (amount ? amount : 31);
		else if (amount)				//ROR
			ofst = (ofst >> amount) | (ofst << (32 - amount));
		else {							//RRX
			ofst >>= 1;
			if (cpu->sr & ARM_SR_BIT_C)
				ofst |= 0x80000000UL;
		}
	}
	else								//12-bit immediate offset
		ofst = instr & 0x0FFF;
	
	//post-indexed with W set are the LDRT/STRT/LDRBT/STRBT forms: not supported
	if (!preIndexed && wBit)
		return false;
	
	indexed = addOfst ? base + ofst : base - ofst;
	ea = preIndexed ? indexed : base;	//post-indexed forms access the unmodified base
	
	if (isLoad) {
		if (byteWide)
			emuCpuRegSet(cpu, rdNo, *(uint8_t*)ea);
		else
			emuCpuRegSet(cpu, rdNo, *(uint32_t*)ea);
	}
	else if (byteWide)
		*(uint8_t*)ea = emuCpuRegGet(cpu, rdNo);
	else
		*(uint32_t*)ea = emuCpuRegGet(cpu, rdNo);
	
	//base writeback happens last: always when post-indexed, only with W when pre-indexed
	if (!preIndexed || wBit)
		emuCpuRegSet(cpu, rnNo, indexed);
	
	return true;
}

//Execute LDM/STM (load/store multiple) in any of the four addressing modes (DA/IA/DB/IB).
// Forms with the S bit set are rejected, except for the case the ifdef below rewrites.
// Registers are walked r0..r15 when incrementing and r15..r0 when decrementing.
//Returns false for rejected encodings, true once the transfer has been done.
static inline bool emuCpuLdmStm(struct EmuCpuState* cpu, uint32_t instr)
{
	uint32_t addr = emuCpuRegGet(cpu, (instr >> 16) & 0x0F);
	int32_t regNo, regStep, stepBefore, stepAfter, left;
	
#ifdef ARM_EMU_DEFINE_THE_UNDEFINED
	//Some apps (Bejeweled, per the original author's note) return from armlets with things like
	// "LDMDB R11, {R4-R8,R11,SP,PC}^". With no SPSR to copy this is undefined in user/system mode,
	// but the author reports real hardware treating it as if S were clear. Do the same here:
	// drop the S bit of an LDM that has both S set and PC in its register list.
	if ((instr & 0x00508000) == 0x00508000)
		instr &=~ 0x00400000;
#endif
	
	//anything still carrying S is a user-mode register transfer, which we do not support
	if (instr & 0x00400000)
		return false;
	
	if (instr & 0x00800000) {		//U set: incrementing, lowest register first
		regNo = 0;
		regStep = 1;
		stepBefore = stepAfter = 4;
	}
	else {							//U clear: decrementing, highest register first
		regNo = 15;
		regStep = -1;
		stepBefore = stepAfter = -4;
	}
	
	if (instr & 0x01000000)			//P set: adjust the address before each access (IB/DB)
		stepAfter = 0;
	else							//P clear: adjust it after each access (IA/DA)
		stepBefore = 0;
	
	for (left = 16; left; left--, regNo += regStep) {
		
		if (!(instr & (1 << regNo)))
			continue;
		
		addr += stepBefore;
		if (instr & 0x00100000)					//load
			emuCpuRegSet(cpu, regNo, *(uint32_t*)addr);
		else									//store
			*(uint32_t*)addr = emuCpuRegGet(cpu, regNo);
		addr += stepAfter;
	}
	
	//wbak?
	if (instr & 0x00200000)
		emuCpuRegSet(cpu, (instr >> 16) & 0x0F, addr);
	
	return true;
}


#ifdef ARM_EMU_USE_PATTERN_MATCHER
	//Compare two buffers one 32-bit word at a time; returns true when the first bytes/4 words are equal.
	// For speed the first word is compared unconditionally, so callers must pass at least 4 bytes.
	static bool emuMemMatchByWord(const void* aP, const void* bP, uint32_t bytes)	//assumes we need at least one word checked (for speed)
	{
		const uint32_t *lhs = aP, *rhs = bP;
		uint32_t wordsLeft = bytes / 4;
		
		for (;;) {
			if (*lhs != *rhs)
				return false;
			lhs++;
			rhs++;
			if (!--wordsLeft)
				return true;
		}
	}
#endif

//Recognise a few well-known runtime helpers (ADS division, divide-by-10 and aligned memcpy) at the
// current emulated PC by comparing their opcodes, and run a native equivalent instead of emulating them.
// On a match the results are placed in the emulated registers and PC is set to LR, i.e. the helper "returns".
// Compiles to a plain "return false" unless ARM_EMU_USE_PATTERN_MATCHER is defined.
static bool __attribute__((noinline)) emuCpuPatMatch(struct EmuCpuState* cpu)	//return true if anything was modified in context
{
	#ifdef ARM_EMU_USE_PATTERN_MATCHER
		
		uint32_t *instrs = (uint32_t*)cpu->regs[REG_NO_PC];
		
		//ADS's udivmod
		{
			//the one word between "first" and "second" is only checked by its top byte (0x2A)
			static const uint32_t first[] = {0xE3A02000, 0xE070C1A1, 0x3A000020, 0xE070C421, 0x3A00000F, 0xE1A00400, 0xE38224FF, 0xE070C221, 0x3A000017, 0xE070C421, 0x3A000009, 0xE1A00400, 0xE38228FF, 0xE070C421, 0x21A00400, 0x23822CFF, 0xE070C221, 0x3A00000E, 0xE270C000};
			static const uint32_t second[] = {0x21A00420, 0xE070C3A1, 0x20411380, 0xE0A22002, 0xE070C321, 0x20411300, 0xE0A22002, 0xE070C2A1, 0x20411280, 0xE0A22002, 0xE070C221, 0x20411200, 0xE0A22002, 0xE070C1A1, 0x20411180, 0xE0A22002, 0xE070C121, 0x20411100, 0xE0A22002, 0xE070C0A1, 0x20411080, 0xE0A22002, 0xE070C001, 0x20411000, 0xE0B22002, 0x2AFFFFE5, 0xE1A00002, 0xE12FFF1E};
			
			if (emuMemMatchByWord(instrs, first, sizeof(first)) && (instrs[sizeof(first) / 4] >> 24) == 0x2A && emuMemMatchByWord(instrs + sizeof(first) / 4 + 1, second, sizeof(second))) {
				
				//pr("ADS udivmod matched at 0x%08x from 0%08x (%u/%u)\n", instrs, cpu->regs[REG_NO_LR], cpu->regs[1], cpu->regs[0]);
				//(r1 / r0) -> (r0 = quo, r1 = rem)
				uint32_t num, denom, q, r;
				
				num = cpu->regs[1];
				denom = cpu->regs[0];
			
				if (denom) {
					q = num / denom;
					r = num % denom;
					
					cpu->regs[0] = q;
					cpu->regs[1] = r;
					cpu->regs[REG_NO_PC] = cpu->regs[REG_NO_LR];
					return true;
				}
				//else let div by zero happen the native way
				return false;
			}
		}
		
		//ADS's sdivmod
		{
			//the one word between "first" and "second" is only checked by its top byte (0x2A)
			static const uint32_t first[] = {0xE2102480, 0x42600000, 0xE0323041, 0x22611000, 0xE070C1A1, 0x3A000020, 0xE070C421, 0x3A00000F, 0xE1A00400, 0xE38224FF, 0xE070C221, 0x3A000017, 0xE070C421, 0x3A000009, 0xE1A00400, 0xE38228FF, 0xE070C421, 0x21A00400, 0x23822CFF, 0xE070C221, 0x3A00000E, 0xE270C000};
			static const uint32_t second[] = {0x21A00420, 0xE070C3A1, 0x20411380, 0xE0A22002, 0xE070C321, 0x20411300, 0xE0A22002, 0xE070C2A1, 0x20411280, 0xE0A22002, 0xE070C221, 0x20411200, 0xE0A22002, 0xE070C1A1, 0x20411180, 0xE0A22002, 0xE070C121, 0x20411100, 0xE0A22002, 0xE070C0A1, 0x20411080, 0xE0A22002, 0xE070C001, 0x20411000, 0xE0B22002, 0x2AFFFFE5, 0xE0320FC3, 0xE0800FA3, 0x22611000, 0xE12FFF1E};
			
			if (emuMemMatchByWord(instrs, first, sizeof(first)) && (instrs[sizeof(first) / 4] >> 24) == 0x2A && emuMemMatchByWord(instrs + sizeof(first) / 4 + 1, second, sizeof(second))) {
				
				//pr("ADS sdivmod matched at 0x%08x from 0%08x (%d/%d)\n", instrs, cpu->regs[REG_NO_LR], cpu->regs[1], cpu->regs[0]);
				//(r1 / r0) -> (r0 = quo, r1 = rem)
				int32_t num, denom, q, r;
				
				num = cpu->regs[1];
				denom = cpu->regs[0];
		
				if (denom) {
					if (num == INT32_MIN && denom == -1) {
						//the only quotient that does not fit in int32_t: dividing in C would be undefined
						// behaviour, so produce the two's-complement wrapped answer explicitly
						q = INT32_MIN;
						r = 0;
					}
					else {
						q = num / denom;
						r = num % denom;
					}
					
					cpu->regs[0] = q;
					cpu->regs[1] = r;
					cpu->regs[REG_NO_PC] = cpu->regs[REG_NO_LR];
					return true;
				}
				//else let div by zero happen the native way
				return false;
			}
		}
	
		//ADS's udiv10
		{
			static const uint32_t match[] = {0xE240100A, 0xE0400120, 0xE0800220, 0xE0800420, 0xE0800820, 0xE1A001A0, 0xE0802100, 0xE0511082, 0x52800001, 0x4281100A, 0xE12FFF1E};
			
			if (emuMemMatchByWord(instrs, match, sizeof(match))) {
				
				//pr("ADS udiv10 matched at 0x%08x from 0%08x (%u/10)\n", instrs, cpu->regs[REG_NO_LR], cpu->regs[0]);
				//(r0 / 10) -> (r0 = quo, r1 = rem)
				uint32_t n = cpu->regs[0];
				
				cpu->regs[0] = n / 10;
				cpu->regs[1] = n % 10;
				cpu->regs[REG_NO_PC] = cpu->regs[REG_NO_LR];
				return true;
			}
		}
	
		//ADS's sdiv10
		{
			static const uint32_t match[] = {0xE1B03000, 0x42600000, 0xE240100A, 0xE0400120, 0xE0800220, 0xE0800420, 0xE0800820, 0xE1A001A0, 0xE0802100, 0xE0511082, 0x52800001, 0x4281100A, 0xE1B03003, 0x42600000, 0x42611000, 0xE12FFF1E};
			
			if (emuMemMatchByWord(instrs, match, sizeof(match))) {
				
				//pr("ADS sdiv10 matched at 0x%08x from 0%08x (%d/10)\n", instrs, cpu->regs[REG_NO_LR], cpu->regs[0]);
				//(r0 / 10) -> (r0 = quo, r1 = rem)
				int32_t n = cpu->regs[0];
				
				cpu->regs[0] = n / 10;
				cpu->regs[1] = n % 10;
				cpu->regs[REG_NO_PC] = cpu->regs[REG_NO_LR];
				return true;
			}
		}
		
		//ADS's memcpy
		{
			static const uint32_t match[] = {0xE92D4010, 0xE2522020, 0x3A000005, 0x28B15018, 0x28A05018, 0x28B15018, 0x28A05018, 0x22522020, 0x2AFFFFF9, 0xE1B0CE02, 0x28B15018, 0x28A05018, 0x48B10018, 0x48A00018, 0xE8BD4010, 0xE1B0CF02, 0x24913004, 0x24803004, 0x012FFF1E, 0xE1B02F82, 0x44D12001, 0x24D13001, 0x24D1C001, 0x44C02001, 0x24C03001, 0x24C0C001, 0xE12FFF1E};
			
			if (emuMemMatchByWord(instrs, match, sizeof(match))) {
				
				//pr("ADS aligned memcpy matched at 0x%08x from 0%08x (0x%x bytes from 0x%08x to 0x%08x)\n", instrs, cpu->regs[REG_NO_LR], cpu->regs[2], cpu->regs[1], cpu->regs[0]);
				//(r0 = dst, r1 = src, r2 = byte count)
				//NOTE(review): unlike the matched routine, this leaves r0..r3/r12 untouched - confirm no caller relies on them
				uint32_t *src = (uint32_t*)cpu->regs[1];
				uint32_t *dst = (uint32_t*)cpu->regs[0];
				uint32_t bytes = cpu->regs[2];
				
				memcpy(dst, src, bytes);
				cpu->regs[REG_NO_PC] = cpu->regs[REG_NO_LR];
				return true;
			}
		}
	#endif
	return false;
}


#ifdef DEBUG_PACE_EMULATION
	#include "boot.h"
	#include "halTrace.h"
	#include <StringMgr.h>
	void impl_HALDbgMessage(const char*);
	//Debug helper: dumps the emulated m68k state (as kept in ARM registers and
	// the memory they point to) through impl_HALDbgMessage(), one line at a time:
	//  - "[pc] = opcode, CCR=" followed by one char per low SR bit (or '-')
	//  - "  D=" + 8 words read through regs[5]
	//  - "  A=" + 8 words read through regs[6]
	//  - "  S=" + 32 bytes read from the address held in the 8th "A" word
	// Uses a static line buffer, so it is not reentrant.
	void __attribute__((noinline)) paceReport(const struct EmuCpuState* state)
	{
		static const char hexDigits[] = "0123456789abcdef";
		static const char ccrNames[] = "CVZNX";
		static char line[128];
		const uint32_t *dataRegs = (const uint32_t*)(state->regs[5]);
		const uint32_t *addrRegs = (const uint32_t*)(state->regs[6]);
		const uint8_t *stackBytes = (const uint8_t*)(addrRegs[7]);
		const uint32_t pc68k = state->regs[10] - 2;
		const uint16_t opcode = state->regs[8];
		const uint16_t sr68k = state->regs[4];
		char *tail;
		unsigned idx;
		
		//header line: pc, opcode, then the five flag characters
		spr(line, "[%08lX] = %04X, CCR=", pc68k, opcode);
		tail = line + strlen(line);
		for (idx = 0; idx < 5; idx++)
			*tail++ = (sr68k & (1 << idx)) ? ccrNames[idx] : '-';
		*tail = 0;
		impl_HALDbgMessage(line);
		
		//the eight "D" words
		strcpy(line, "  D=");
		tail = line + strlen(line);
		for (idx = 0; idx < 8; idx++) {
			spr(tail, "%08lX ", dataRegs[idx]);
			tail += strlen(tail);
		}
		impl_HALDbgMessage(line);
		
		//the eight "A" words
		strcpy(line, "  A=");
		tail = line + strlen(line);
		for (idx = 0; idx < 8; idx++) {
			spr(tail, "%08lX ", addrRegs[idx]);
			tail += strlen(tail);
		}
		impl_HALDbgMessage(line);
		
		//first 32 bytes at the stack pointer, as hex pairs
		strcpy(line, "  S=");
		tail = line + strlen(line);
		for (idx = 0; idx < 32; idx++) {
			const uint8_t byte = stackBytes[idx];
			
			spr(tail, "%c%c ", hexDigits[byte >> 4], hexDigits[byte & 15]);
			tail += strlen(tail);
		}
		impl_HALDbgMessage(line);
	}
#endif

//Interpreter main loop for ARM-mode code.
// Calls schedSetCurThreadEmuCtx(state), then repeatedly:
//  1. reads regs[PC]; if it is not the address we expected from straight-line
//     execution (a branch happened, or this is the first iteration), it
//     a) leaves the loop if the low bit is set (thumb target), and
//     b) gives emuCpuPatMatch() a chance to handle a known routine natively
//  2. fetches the 32-bit instruction and advances regs[PC] by 4
//  3. evaluates the condition field (instr[31:28]) against state->sr; a failed
//     condition simply moves on to the next instruction
//  4. dispatches on instr[27:24] to the proper handler
// Anything not handled ends up in emuCpuUndefInstr().
// Returns the same "state" pointer it was given, once the emulated PC has its
// low bit set (nonlinear jump to thumb, or BLX imm).
struct EmuCpuState* __attribute__((used)) emuCpuRun(struct EmuCpuState* state)			//return state
{
	uint32_t instr, tmp, expectedPc = 1;	//make sure it will not match at first so we can pattern match
	bool s, carry = false;
	
	schedSetCurThreadEmuCtx(state);
	while(1) {
		
		tmp = state->regs[REG_NO_PC];
		
		if (expectedPc != tmp) {		//only check if nonlinearity in PC seen
			
			if (tmp & 1)				//break in case of thumb
				goto out;
			if (emuCpuPatMatch(state))	//matched routine was handled and PC changed, so re-evaluate from the top
				continue;
		}

		//from here on regs[PC] is the address of the NEXT instruction; emuCpuRegGet() adds the other 4
		state->regs[REG_NO_PC] = expectedPc = tmp + 4;
		instr = *(uint32_t*)tmp;

		#ifdef DEBUG_PACE_EMULATION
			if (instr == 0xe79b0520 && ((uint32_t*)tmp)[1] == 0xe0da70b2 &&  ((uint32_t*)tmp)[2] == 0xe08bf000)
				paceReport(state);
		#endif
		
		//eval cond. each case does "continue" (skip the instruction) when its condition is NOT satisfied
		tmp = state->sr;
		switch (instr >> 28) {
			case 0x00:		//EQ
				if (!(tmp & ARM_SR_BIT_Z))
					continue;
				break;
			case 0x01:		//NE
				if (tmp & ARM_SR_BIT_Z)
					continue;
				break;
			case 0x02:		//CS
				if (!(tmp & ARM_SR_BIT_C))
					continue;
				break;
			case 0x03:		//CC
				if (tmp & ARM_SR_BIT_C)
					continue;
				break;
			case 0x04:		//MI
				if (!(tmp & ARM_SR_BIT_N))
					continue;
				break;
			case 0x05:		//PL
				if (tmp & ARM_SR_BIT_N)
					continue;
				break;
			case 0x06:		//VS
				if (!(tmp & ARM_SR_BIT_V))
					continue;
				break;
			case 0x07:		//VC
				if (tmp & ARM_SR_BIT_V)
					continue;
				break;
			case 0x08:		//HI
				if ((tmp & ARM_SR_BIT_Z) || !(tmp & ARM_SR_BIT_C))
					continue;
				break;
			case 0x09:		//LS
				tmp &= (ARM_SR_BIT_C | ARM_SR_BIT_Z);
				if (tmp == ARM_SR_BIT_C)	//C set and Z clear is the only failing combination
					continue;
				break;
			case 0x0A:		//GE
				tmp &= (ARM_SR_BIT_N | ARM_SR_BIT_V);
				if (tmp && (tmp != (ARM_SR_BIT_N | ARM_SR_BIT_V)))	//exactly one of N,V set -> N != V
					continue;
				break;
			case 0x0B:		//LT
				tmp &= (ARM_SR_BIT_N | ARM_SR_BIT_V);
				if (!tmp || (tmp == (ARM_SR_BIT_N | ARM_SR_BIT_V)))	//neither or both set -> N == V
					continue;
				break;
			case 0x0C:		//GT
				tmp &= (ARM_SR_BIT_N | ARM_SR_BIT_V);
				if ((state->sr & ARM_SR_BIT_Z) || (tmp && (tmp != (ARM_SR_BIT_N | ARM_SR_BIT_V))))
					continue;
				break;
			case 0x0D:		//LE
				tmp &= (ARM_SR_BIT_N | ARM_SR_BIT_V);
				if (!(state->sr & ARM_SR_BIT_Z) && (!tmp || (tmp == (ARM_SR_BIT_N | ARM_SR_BIT_V))))
					continue;
				break;
			case 0x0E:		//AL
				break;
			case 0x0F:		//NV
				if ((instr >> 25) == 0x7D) {					//BLX imm
					
					state->regs[REG_NO_LR] = emuCpuRegGet(state, REG_NO_PC) - 4;	//address of the instruction after the BLX
					tmp = ((int32_t)(instr << 8)) >> 6;		//sign-extended imm24 * 4
					if (instr & 0x01000000)					//H bit adds a halfword
						tmp += 2;
					state->regs[REG_NO_PC] = emuCpuRegGet(state, REG_NO_PC) + tmp + 1;	//1 is the T bit
					goto out;	//always goes to thumb mode so might as well just go there
				}
				else if ((instr & 0xFD70F000) == 0xF550F000)	//PLD - treated as a nop
					continue;
				else
					goto undef;
			default:
				//unreached
				goto undef;
		}
		
		//we got this far - time to execute
		switch ((instr >> 24) & 0x0F) {
			case 0x00:
			case 0x01:
				if (!emuCpuAddrMode1reg(state, instr, &tmp, &s, &carry)) {		//multiplies, extra loads and stores
					if (!emuCpuMultipliesAndExtraLoadsAndStores(state, instr))
						goto undef;
					break;
				}
				goto data_processing;
			case 0x02:
			case 0x03:
				emuCpuAddrMode1imm(state, instr, &tmp, &s, &carry);
		data_processing:
				if (!emuCpuDataProcessing(state, instr, tmp, s, carry))
					goto undef;
				break;
			case 0x04:
			case 0x05:
			case 0x06:
			case 0x07:
				if (!emuCpuMode2(state, instr))
					goto undef;
				break;
			case 0x08:
			case 0x09:
				if (!emuCpuLdmStm(state, instr))
					goto undef;
				break;
			case 0x0B:		//BL: set LR, then branch like B
				emuCpuRegSet(state, REG_NO_LR, emuCpuRegGet(state, REG_NO_PC) - 4);	//PC's been updated so this works
				//fallthrough
			case 0x0A:		//B
				tmp = ((int32_t)(instr << 8)) >> 6;		//sign-extended imm24 * 4
				state->regs[REG_NO_PC] = tmp + emuCpuRegGet(state, REG_NO_PC);
				break;
			
			case 0x0F:		//SWI: only the 0x123456 immediate is offered to kernelSemihostingHandle(); all else is undefined here
				if ((instr & 0x00ffffff) == 0x123456 && kernelSemihostingHandle(&state->regs[0], &state->regs[1], &state->regs[2], &state->regs[3], state->regs[12], state->regs[REG_NO_SP], state->regs[REG_NO_LR], state->regs[REG_NO_PC], true))
					break;
				//fallthrough
				
			case 0x0C:
			case 0x0D:
				//coprocessor loads/store/double reg xfers
				//fallthrough
			case 0x0E:
				//coproc data processing, coproc reg xfers
				//fallthrough
			default:
	undef:
				//report the address the instruction was fetched from. regs[PC] is already past it
				// (and a handler may have changed it before failing), but expectedPc is still fetch address + 4
				emuCpuUndefInstr(state, expectedPc - 4, instr, "INVALID INSTRUCTION");
				break;
		}
	}

out:
	//state->regs[REG_NO_PC] has low bit set now. we WANT this for our return method
	return state;		//no need to fix PC as we get here with PC properly set
}
