#include <machine/bootinfo.h>
#include <machine/vcacheregs.h>
#include <mips/asm.h>
#include <mips/cpuregs.h>
#include "tty.h"

	.text
	.globl  start
/*
 * start: entry point executed by every CPU.
 *
 * All CPUs write the VC_TLB_EN coprocessor-2 register and clear the 16 words
 * of cpu_bitmaps[].  The CPU whose number (low 9 bits of CP0 EBase) is 0 then
 * acts as boot CPU: it sets up the stack, zeroes edata..end, fills
 * cpu_vectors[] with a non-zero byte pattern, clears its own cpu_vectors[]
 * slot, publishes its bit in cpu_bitmaps[] and calls main().  Every other CPU
 * branches to seccpu.  Progress characters D, E and F are written to the TTY.
 *
 * The whole file is assembled with .set noreorder: the instruction following
 * each branch or jump is its delay slot and is always executed.
 *
 * NOTE(review): secondary CPUs run the cpu_bitmaps[] clearing loop as well.
 * A CPU arriving here after the boot CPU has published its bit would wipe
 * that bit.  Presumably all CPUs leave reset together - confirm.
 */
start:
	.set noreorder
	.set mips32
#ifdef __GP_SUPPORT__
	la	gp, _C_LABEL (_gp)
#endif
	/* Presumably enables the I-cache and D-cache (going by the names). */
	li	t0, (VC_TLB_EN_ICACHE | VC_TLB_EN_DCACHE)
	mtc2	t0, $VC_TLB_EN, 0

	li	a0,  16			# 16 words in cpu_bitmaps[]
	la	a1, _C_LABEL (cpu_bitmaps)
1:
	sw	zero, 0(a1) #CPUs wait for this to be non-0
	addiu	a0, a0, -1
	bne	a0, zero, 1b
	addiu	a1, a1, 4		# delay slot: advance to next word
	sync

	mfc0	a0, MIPS_COP_0_EBASE, 1
	andi	a0, a0, 0x1ff		# a0 <- cpunum
	bne	a0, zero, seccpu	# only CPU 0 continues as boot CPU
	nop

	li	a0, 'D'
	la	a1, TTY_BASE
	sb	a0, TTY_WRITE(a1)

	/*
	 * Stack grows down from stk.  Align it to 16 bytes and leave a
	 * 16-byte argument save area above sp: under the o32 ABI a callee
	 * may store a0-a3 at 0..15(sp), which would otherwise land on
	 * whatever data follows stk.
	 */
	la	sp, _C_LABEL(stk)	#setup stack
	li	t0, -16
	and	sp, sp, t0
	addiu	sp, sp, -16

	la	a0, _C_LABEL (edata)
	move	a1, zero
	la	a2, _C_LABEL(end)	# memset(edata, 0, end - edata)
	jal	_C_LABEL(memset)
	subu	a2, a2, a0		# delay slot: a2 <- end - edata

	/*
	 * memset() stores the byte 0x01, so every cpu_vectors[] word becomes
	 * the non-zero value 0x01010101.
	 */
	la	a0, _C_LABEL (cpu_vectors)
	li	a1, 1
	li	a2, BOOTINFO_NCPUS_MAX * 4
	jal	_C_LABEL(memset)	# memset(cpu_vectors, 1, BOOTINFO_NCPUS_MAX * 4)
	nop

	/* a0 was clobbered by the calls above: read the CPU number again. */
	mfc0	a0, MIPS_COP_0_EBASE, 1
	andi	a0, a0, 0x1ff
	sll	t0, a0, 2		# t0 <- cpunum * 4
	la	a1, _C_LABEL (cpu_vectors)
	addu	a1, a1, t0		# addu: address arithmetic must not trap
	sw	zero, 0(a1) # cpu_vectors[cpunum] <- 0

	andi	a1, a0, 0x1e0
	srl	a1, a1, 3		# a1 <- (cpunum / 32) * 4
	la	t0, _C_LABEL (cpu_bitmaps)
	addu	t0, t0, a1   # t0 <- &cpu_bitmaps[cpunum / 32]
	li	a1, 1
	andi	a0, a0, 0x1f
	sllv	a1, a1, a0   # a1 <- 1 << (cpunum % 32)
	sw	a1, 0(t0)    #set bit in cpu_bitmaps[] for boot CPU and release
	sync		     #secondary CPUs

	li	a0, 'E'
	la	a1, TTY_BASE
	sb	a0, TTY_WRITE(a1)
	jal	_C_LABEL(main)
	nop
	/* main() returned: report it and spin forever. */
	li	a0, 'F'
	la	a1, TTY_BASE
	sb	a0, TTY_WRITE(a1)
l:
	j	l
	nop				# delay slot (was missing)

/*
 * seccpu: spin loop for CPUs whose number is non-zero (branched to from
 * start).  Repeatedly scans the 16 words of cpu_bitmaps[] and leaves for
 * secstart as soon as any word is non-zero.  If a full pass finds only
 * zeroes the scan restarts from the first word.
 *
 * Assembled under .set noreorder: the instruction after each branch is its
 * delay slot and executes whether or not the branch is taken.
 */
seccpu:
	li	a0,  16			# a0 <- words left to scan
	la	a1, _C_LABEL (cpu_bitmaps)	# a1 <- address of word being scanned
1:
	lw	t0, 0(a1) 		# t0 <- current bitmap word
	bne	t0, zero, secstart #one word of cpu_bitmaps[] != 0: boot CPU ready
	addiu	a0, a0, -1		# delay slot: one word fewer to scan
	bne	a0, zero, 1b
	addiu	a1, a1, 4		# delay slot: advance to next word
	j	seccpu			# all 16 words were zero: rescan
	nop

/*
 * secstart: run by a secondary CPU once cpu_bitmaps[] has become non-zero.
 *
 * Atomically sets this CPU's bit in cpu_bitmaps[], writes the character
 * '0' + cpunum to the TTY, stores 0 into cpu_vectors[cpunum] and then polls
 * that slot (cpuwait) until it holds a non-zero value, which is used as the
 * address to jump to.
 *
 * Register use: s0 = cpunum (low 9 bits of CP0 EBase),
 *               a1 = &cpu_vectors[cpunum] while polling.
 *
 * Assembled under .set noreorder: every branch and jump needs an explicit
 * delay-slot instruction.
 */
secstart:
	mfc0	s0, MIPS_COP_0_EBASE, 1
	andi	s0, s0, 0x1ff		# s0 <- cpunum
	andi	a1, s0, 0x1e0
	srl	a1, a1, 3		# a1 <- (cpunum / 32) * 4
	la	t0, _C_LABEL (cpu_bitmaps)
	addu	t0, t0, a1   # t0 <- &cpu_bitmaps[cpunum / 32]
	li	a1, 1
	andi	a0, s0, 0x1f
	sllv	a1, a1, a0   # a1 <- 1 << (cpunum % 32)
2:
	ll	a0, 0(t0)
	or	a0, a0, a1
	sc	a0, 0(t0)    #set bit in cpu_bitmaps[]; a0 <- 0 if the store failed
	beq	a0, zero, 2b		# retry until the ll/sc pair succeeds
	nop

	/* Progress marker: only a decimal digit for cpunum 0..9. */
	addiu	t0, s0, '0'
	la	a1, TTY_BASE
	sb	t0, TTY_WRITE(a1)

	/* cpu_vectors[] accessible now, store 0 and wait for jump addr */
	sll	t0, s0, 2		# t0 <- cpunum * 4
	la	a1, _C_LABEL (cpu_vectors)
	addu	a1, a1, t0		# a1 <- &cpu_vectors[cpunum]
	sw	zero, 0(a1)
	sync

cpuwait:
	lw	t0, 0(a1)
	beq	t0, zero, cpuwait
	nop
	jr	t0	#jump at address software stored in cpu_vectors[n]
	nop		# delay slot (was missing: an unrelated instruction ran here)

	.data
	.global stk
	.space 1024		# 512 bytes for stack
stk:	.space 1
