19 files changed, 1807 insertions, 0 deletions
diff --git a/libmeidogte/Makefile b/libmeidogte/Makefile
new file mode 100644
index 0000000..938f3c2
--- /dev/null
+++ b/libmeidogte/Makefile
@@ -0,0 +1,26 @@
+include ../Makefile.cfg
+# Sources are discovered once at parse time (simple ':=' assignment).
+CFILES	:= $(notdir $(wildcard ./*.c))
+AFILES	:= $(notdir $(wildcard ./*.s))
+OFILES	:= $(CFILES:.c=.o) $(AFILES:.s=.o)
+
+TARGET	:= libmeidogte.a
+.PHONY: all clean install
+all: $(TARGET)
+# Static library built from every object in this directory.
+$(TARGET): $(OFILES)
+	$(AR) cr $@ $^
+	$(RANLIB) $@
+# C objects are rebuilt when the library headers change.
+%.o: %.c meidogte.h meidogte_inline.h
+	$(CC) $(CFLAGS) -I./ -c $< -o $@
+# The .s files pull these in via .include, so track them as prerequisites.
+%.o: %.s gtereg.h inline_s.h
+	$(AS) $(AFLAGS) -I ./ $< -o $@
+
+clean:
+	rm -Rf $(TARGET) $(OFILES)
+
+install: all
+	cp $(TARGET) $(TOOLCHAIN_PREFIX)/lib
+	cp meidogte.h meidogte_inline.h $(TOOLCHAIN_PREFIX)/include
diff --git a/libmeidogte/applymatrixlv.s b/libmeidogte/applymatrixlv.s
new file mode 100644
index 0000000..332a2f8
--- /dev/null
+++ b/libmeidogte/applymatrixlv.s
@@ -0,0 +1,40 @@
+.set noreorder
+
+.include "gtereg.h"
+.include "inline_s.h"
+
+.section .text
+
+
+.global ApplyMatrixLV
+.type ApplyMatrixLV, @function
+ApplyMatrixLV:		# VECTOR *ApplyMatrixLV(MATRIX *m, VECTOR *v0, VECTOR *v1)
+	# a0 = m, a1 = v0 (input vector), a2 = v1 (output vector, also returned)
+	# Load matrix to GTE
+	lw		$t0, 0($a0)		# m[0][0] | m[0][1]
+	lw		$t1, 4($a0)		# m[0][2] | m[1][0]
+	ctc2	$t0, $0			# control reg 0 (C2_R11R12)
+	ctc2	$t1, $1			# control reg 1 (C2_R13R21)
+	lw		$t0, 8($a0)		# m[1][1] | m[1][2]
+	lw		$t1, 12($a0)	# m[2][0] | m[2][1]
+	lhu		$t2, 16($a0)	# m[2][2] is a lone halfword
+	ctc2	$t0, $2			# C2_R22R23
+	ctc2	$t1, $3			# C2_R31R32
+	ctc2	$t2, $4			# C2_R33
+
+	lw		$t0, 0($a1)		# input vx
+	lw		$t1, 4($a1)		# input vy
+	mtc2	$t0, C2_IR1
+	lw		$t0, 8($a1)		# input vz
+	mtc2	$t1, C2_IR2
+	mtc2	$t0, C2_IR3
+
+	nMVMVA(1, 0, 3, 3, 0)	# sf=1 mx=0 v=3 cv=3 lm=0 (macro adds two leading nops)
+
+	swc2	C2_IR1, 0($a2)	# store the three results as words into the output
+	swc2	C2_IR2, 4($a2)
+	swc2	C2_IR3, 8($a2)
+
+	jr		$ra
+	move	$v0, $a2		# branch delay slot: return the output pointer
+	
+\ No newline at end of file
diff --git a/libmeidogte/compmatrixlv.s b/libmeidogte/compmatrixlv.s
new file mode 100644
index 0000000..f613385
--- /dev/null
+++ b/libmeidogte/compmatrixlv.s
@@ -0,0 +1,100 @@
+.set noreorder
+
+.include "gtereg.h"
+.include "inline_s.h"
+
+.set MATRIX_r11r12,	0
+.set MATRIX_r13r21,	4
+.set MATRIX_r22r23,	8
+.set MATRIX_r31r32,	12
+.set MATRIX_r33,	16
+.set MATRIX_trx,	20
+.set MATRIX_try,	24
+.set MATRIX_trz,	28
+# The byte offsets above follow MATRIX: short m[3][3] first, then int t[3] at 20.
+
+.global CompMatrixLV
+.type CompMatrixLV, @function
+CompMatrixLV:
+	# a0, a1 = input matrices; a2 = output matrix (its pointer is returned)
+	# Load matrix v0 to GTE
+	lw		$t0, MATRIX_r11r12($a0)
+	lw		$t1, MATRIX_r13r21($a0)
+	ctc2	$t0, C2_R11R12
+	ctc2	$t1, C2_R13R21
+	lw		$t0, MATRIX_r22r23($a0)
+	lw		$t1, MATRIX_r31r32($a0)
+	lhu		$t2, MATRIX_r33($a0)
+	ctc2	$t0, C2_R22R23
+	lw		$t0, MATRIX_trx($a0)
+	ctc2	$t1, C2_R31R32
+	lw		$t1, MATRIX_try($a0)
+	ctc2	$t2, C2_R33
+	lw		$t2, MATRIX_trz($a0)
+	ctc2	$t0, C2_TRX
+	ctc2	$t1, C2_TRY
+	ctc2	$t2, C2_TRZ
+	
+	lw		$t0, MATRIX_trx($a1)		# second matrix's translation -> IR1..IR3
+	lw		$t1, MATRIX_try($a1)
+	mtc2	$t0, C2_IR1
+	lw		$t0, MATRIX_trz($a1)
+	mtc2	$t1, C2_IR2
+	mtc2	$t0, C2_IR3
+
+	nMVMVA(1, 0, 3, 0, 0)				# cv=0 here, cv=3 below; presumably adds TR - confirm
+
+	swc2	C2_IR1, MATRIX_trx($a2)		# result becomes the output translation
+	swc2	C2_IR2, MATRIX_try($a2)
+	swc2	C2_IR3, MATRIX_trz($a2)
+	
+	lhu		$t1, 2*(0+(3*1))($a1)		# Load values for first
+	lhu		$t0, 2*(0+(3*0))($a1)		# R11 R21 R31
+	sll		$t1, 16
+	or		$t0, $t1
+	lhu		$t1, 2*(0+(3*2))($a1)
+	mtc2	$t0, C2_VXY0
+	mtc2	$t1, C2_VZ0
+
+	lhu		$t1, 2*(1+(3*1))($a1)		# Load values for second
+	lhu		$t0, 2*(1+(3*0))($a1)		# R12 R22 R32
+	MVMVA(1, 0, 0, 3, 0)				# First multiply
+	sll		$t1, 16
+	or		$t0, $t1
+	lhu		$t1, 2*(1+(3*2))($a1)
+	mtc2	$t0, C2_VXY0
+	mtc2	$t1, C2_VZ0
+
+	mfc2	$t0, C2_IR1					# Store results of first
+	mfc2	$t1, C2_IR2
+	sh		$t0, 2*(0+(3*0))($a2)
+	mfc2	$t0, C2_IR3
+	sh		$t1, 2*(0+(3*1))($a2)
+	sh		$t0, 2*(0+(3*2))($a2)
+
+	lhu		$t1, 2*(2+(3*1))($a1)		# Load values for third
+	lhu		$t0, 2*(2+(3*0))($a1)		# R13 R23 R33
+	MVMVA(1, 0, 0, 3, 0)				# Second multiply
+	sll		$t1, 16
+	or		$t0, $t1
+	lhu		$t1, 2*(2+(3*2))($a1)
+	mtc2	$t0, C2_VXY0
+	mtc2	$t1, C2_VZ0
+
+	mfc2	$t0, C2_IR1					# Store results of second
+	mfc2	$t1, C2_IR2
+	sh		$t0, 2*(1+(3*0))($a2)
+	mfc2	$t0, C2_IR3
+	sh		$t1, 2*(1+(3*1))($a2)
+	sh		$t0, 2*(1+(3*2))($a2)
+	MVMVA(1, 0, 0, 3, 0)				# Third multiply
+
+	mfc2	$t0, C2_IR1					# Store results of third
+	mfc2	$t1, C2_IR2
+	sh		$t0, 2*(2+(3*0))($a2)
+	mfc2	$t0, C2_IR3
+	sh		$t1, 2*(2+(3*1))($a2)
+	sh		$t0, 2*(2+(3*2))($a2)
+
+	jr		$ra
+	move	$v0, $a2					# branch delay slot: return the output pointer
diff --git a/libmeidogte/gtereg.h b/libmeidogte/gtereg.h
new file mode 100644
index 0000000..5d3391b
--- /dev/null
+++ b/libmeidogte/gtereg.h
@@ -0,0 +1,80 @@
+# GTE register definitions for GNU assembler (as).
+#
+# 2019 Meido-Tek Productions
+
+#
+# GTE data registers (use mfc2, mtc2, lwc2, swc2)
+#
+.set C2_VXY0,	$0
+.set C2_VZ0,	$1
+.set C2_VXY1,	$2
+.set C2_VZ1,	$3
+.set C2_VXY2,	$4
+.set C2_VZ2,	$5
+.set C2_RGB,	$6
+.set C2_OTZ,	$7
+
+.set C2_IR0,	$8
+.set C2_IR1,	$9
+.set C2_IR2,	$10
+.set C2_IR3,	$11
+.set C2_SXY0,	$12
+.set C2_SXY1,	$13
+.set C2_SXY2,	$14
+.set C2_SXYP,	$15
+
+.set C2_SZ0,	$16
+.set C2_SZ1,	$17
+.set C2_SZ2,	$18
+.set C2_SZ3,	$19
+.set C2_RGB0,	$20
+.set C2_RGB1,	$21
+.set C2_RGB2,	$22
+# (data register $23 is not given a name in this file)
+.set C2_MAC0,	$24
+.set C2_MAC1,	$25
+.set C2_MAC2,	$26
+.set C2_MAC3,	$27
+.set C2_IRGB,	$28
+.set C2_ORGB,	$29
+.set C2_LZCS,	$30
+.set C2_LZCR,	$31
+
+#
+# GTE control registers (use cfc2/ctc2)
+# The numbers overlap the data registers above; the instruction used picks the bank.
+.set C2_R11R12,	$0
+.set C2_R13R21,	$1
+.set C2_R22R23,	$2
+.set C2_R31R32,	$3
+.set C2_R33,	$4
+.set C2_TRX,	$5
+.set C2_TRY,	$6
+.set C2_TRZ,	$7
+
+.set C2_L11L12,	$8
+.set C2_L13L21,	$9
+.set C2_L22L23,	$10
+.set C2_L31L32,	$11
+.set C2_L33,	$12
+.set C2_RBK,	$13
+.set C2_GBK,	$14
+.set C2_BBK,	$15
+
+.set C2_LR1LR2,	$16
+.set C2_LR3LG1,	$17
+.set C2_LG2LG3,	$18
+.set C2_LB1LB2,	$19
+.set C2_LB3,	$20
+.set C2_RFC,	$21
+.set C2_GFC,	$22
+.set C2_BFC,	$23
+
+.set C2_OFX,	$24
+.set C2_OFY,	$25
+.set C2_H,		$26
+.set C2_DQA,	$27
+.set C2_DQB,	$28
+.set C2_ZSF3,	$29
+.set C2_ZSF4,	$30
+.set C2_FLAG,	$31
diff --git a/libmeidogte/hirotmatrix.c b/libmeidogte/hirotmatrix.c
new file mode 100644
index 0000000..5a252ff
--- /dev/null
+++ b/libmeidogte/hirotmatrix.c
@@ -0,0 +1,35 @@
+#include <meidogte.h>
+
+MATRIX *HiRotMatrix(VECTOR *r, MATRIX *m) {
+
+	/* Sine/cosine of each axis angle, narrowed to the matrix element type. */
+	const short sin_x = hisin(r->vx), cos_x = hicos(r->vx);
+	const short sin_y = hisin(r->vy), cos_y = hicos(r->vy);
+	const short sin_z = hisin(r->vz), cos_z = hicos(r->vz);
+
+	/* Only the 3x3 parts are filled in; the t[] members are never set here. */
+	MATRIX rot_y, rot_z, rot_xy;
+
+	/* X rotation is written straight into the caller's matrix. */
+	m->m[0][0] = ONE;		m->m[0][1] = 0;			m->m[0][2] = 0;
+	m->m[1][0] = 0;			m->m[1][1] = cos_x;		m->m[1][2] = -sin_x;
+	m->m[2][0] = 0;			m->m[2][1] = sin_x;		m->m[2][2] = cos_x;
+
+	/* Y rotation */
+	rot_y.m[0][0] = cos_y;	rot_y.m[0][1] = 0;		rot_y.m[0][2] = sin_y;
+	rot_y.m[1][0] = 0;		rot_y.m[1][1] = ONE;	rot_y.m[1][2] = 0;
+	rot_y.m[2][0] = -sin_y;	rot_y.m[2][1] = 0;		rot_y.m[2][2] = cos_y;
+
+	/* Z rotation */
+	rot_z.m[0][0] = cos_z;	rot_z.m[0][1] = -sin_z;	rot_z.m[0][2] = 0;
+	rot_z.m[1][0] = sin_z;	rot_z.m[1][1] = cos_z;	rot_z.m[1][2] = 0;
+	rot_z.m[2][0] = 0;		rot_z.m[2][1] = 0;		rot_z.m[2][2] = ONE;
+
+	/* Multiply X by Y, then that product by Z, between a PushMatrix/PopMatrix pair. */
+	PushMatrix();
+	MulMatrix0( m, &rot_y, &rot_xy );
+	MulMatrix0( &rot_xy, &rot_z, m );
+	PopMatrix();
+
+	return m;
+}
diff --git a/libmeidogte/hisin.c b/libmeidogte/hisin.c
new file mode 100644
index 0000000..df03194
--- /dev/null
+++ b/libmeidogte/hisin.c
@@ -0,0 +1,33 @@
+/* Based on isin_S4 implementation from coranac:
+ *	http://www.coranac.com/2009/07/sines/
+ *
+ */
+
+#define qN	15		/* 1<<qN input units are a quarter turn */
+#define qA	12		/* 1<<qA is the output value standing for 1.0 */
+#define B	19900		/* coefficients of the polynomial */
+#define	C	3516		/* A - x^2*(B - x^2*C) evaluated below */
+
+int hisin(int x) {
+    /* Sine of x, where 4<<qN (131072) units make a full turn; the result is
+     * scaled so 1<<qA (4096) stands for 1.0. Left shifts are done on unsigned
+     * values: shifting bits out of a signed int is undefined behaviour in C. */
+    int c, y;
+
+    c= (int)((unsigned int)x<<(30-qN));  // Semi-circle info into carry.
+    x -= 1<<qN;                 // sine -> cosine calc
+
+    x= (int)((unsigned int)x<<(31-qN));  // Mask with PI
+    x= x>>(31-qN);              // Note: SIGNED shift! (to qN)
+    x= x*x>>(2*qN-14);          // x=x^2 To Q14
+
+    y= B - (x*C>>14);           // B - x^2*C
+    y= (1<<qA)-(x*y>>16);       // A - x^2*(B-x^2*C)
+    return c>=0 ? y : -y;       // sign bit of c marks the second half turn
+}
+
+int hicos(int x) {
+    /* Cosine via the sine routine: shift the angle by 32768 units. */
+    int shifted = x + 32768;
+    return hisin( shifted );
+}
diff --git a/libmeidogte/initgeom.s b/libmeidogte/initgeom.s
new file mode 100644
index 0000000..14ca293
--- /dev/null
+++ b/libmeidogte/initgeom.s
@@ -0,0 +1,45 @@
+.set noreorder
+
+.include "gtereg.h"
+
+.section .text
+
+
+.global InitGeom
+.type InitGeom, @function
+InitGeom:						# void InitGeom(): enable cop2 and load default GTE settings
+	addiu	$sp, -4					# room to keep ra across the two calls below
+	sw		$ra, 0($sp)
+
+	jal		EnterCriticalSection
+	nop								# branch delay slot
+
+	mfc0	$v0, $12				# Get SR
+	lui		$v1, 0x4000				# Set bit to enable cop2
+	or		$v0, $v1
+	mtc0	$v0, $12				# Set new SR
+
+	jal		ExitCriticalSection
+	nop								# branch delay slot
+
+	ctc2	$0 , $24				# Reset GTE offset
+	ctc2	$0 , $25				# ($24/$25 are C2_OFX/C2_OFY in gtereg.h)
+
+	li		$v0, 320				# Set default projection plane
+	ctc2	$v0, $26				# $26 is C2_H
+
+	li		$v0, 0x155				# Set ZSF3 and ZSF4 defaults
+	ctc2	$v0, $29				# $29 is C2_ZSF3
+	li		$v0, 0x100
+	ctc2	$v0, $30				# $30 is C2_ZSF4
+
+	li		$v0, 0xef9e				# DQA and DQB defaults
+	lui		$v1, 0x0140				# v1 = 0x01400000
+	ctc2	$v0, C2_DQA
+	ctc2	$v1, C2_DQB
+
+	lw		$ra, 0($sp)				# restore ra and release the stack slot
+	addiu	$sp, 4
+	jr		$ra
+	nop								# branch delay slot
+
diff --git a/libmeidogte/inline_s.h b/libmeidogte/inline_s.h
new file mode 100644
index 0000000..08e5c38
--- /dev/null
+++ b/libmeidogte/inline_s.h
@@ -0,0 +1,227 @@
+# Inline GTE macros for GNU assembler (as).
+#
+# 2019 Meido-Tek Productions
+#
+
+.macro nRTPS
+	nop		# every n-prefixed macro issues two nops, then one cop2 command word
+	nop
+	cop2 0x0180001
+.endm
+
+.macro nRTPT
+	nop
+	nop
+	cop2 0x0280030
+.endm
+
+.macro nNCLIP
+	nop
+	nop
+	cop2 0x1400006
+.endm
+	
+.macro nAVSZ3
+	nop
+	nop
+	cop2 0x158002D
+.endm
+	
+.macro nAVSZ4
+	nop
+	nop
+	cop2 0x168002E
+.endm
+
+.macro nMVMVA sf mx v cv lm	# arguments are OR-ed in at bits 19, 17, 15, 13 and 10
+	nop
+	nop
+	cop2	0x0400012|(\sf<<19)|(\mx<<17)|(\v<<15)|(\cv<<13)|(\lm<<10)
+.endm
+	
+.macro nSQR sf	# sf is OR-ed in at bit 19
+	nop
+	nop
+	cop2	0x0A00428|(\sf<<19)
+.endm
+	
+.macro nnOP sf lm	# extra n to prevent conflict with the nop opcode
+	nop
+	nop
+	cop2	0x170000C|(\sf<<19)|(\lm<<10)
+.endm
+	
+.macro nNCS
+	nop
+	nop
+	cop2	0x0C8041E
+.endm
+	
+.macro nNCT
+	nop
+	nop
+	cop2	0x0D80420
+.endm
+	
+.macro nNCCS
+	nop
+	nop
+	cop2	0x108041B
+.endm
+	
+.macro nNCCT
+	nop
+	nop
+	cop2	0x118043F
+.endm
+	
+.macro nNCDS
+	nop
+	nop
+	cop2	0x0E80413
+.endm
+	
+.macro nNCDT
+	nop
+	nop
+	cop2	0x0F80416
+.endm
+	
+.macro nCC
+	nop
+	nop
+	cop2	0x138041C
+.endm
+	
+.macro nCDP
+	nop
+	nop
+	cop2	0x1280414
+.endm
+	
+.macro nDCPL
+	nop
+	nop
+	cop2	0x0680029
+.endm
+	
+.macro nDPCS
+	nop
+	nop
+	cop2	0x0780010
+.endm
+	
+.macro nDPCT	# DPCT is GTE function 0x2A; 0x0180001 is the RTPS word and must not be used here
+	nop
+	nop
+	cop2	0x0F8002A
+.endm
+
+.macro nINTPL
+	nop
+	nop
+	cop2	0x0980011
+.endm
+
+.macro nGPF sf	# sf is OR-ed in at bit 19
+	nop
+	nop
+	cop2	0x190003D|(\sf<<19)
+.endm
+	
+.macro nGPL sf	# sf is OR-ed in at bit 19
+	nop
+	nop
+	cop2	0x1A0003E|(\sf<<19)
+.endm
+	
+#
+# Macros without leading nops (for optimized usage)
+# Each emits exactly the same command word as its n-prefixed counterpart above.
+.macro RTPS
+	cop2 0x0180001
+.endm
+
+.macro RTPT
+	cop2 0x0280030
+.endm
+
+.macro NCLIP
+	cop2 0x1400006
+.endm
+	
+.macro AVSZ3
+	cop2 0x158002D
+.endm
+	
+.macro AVSZ4
+	cop2 0x168002E
+.endm
+
+.macro MVMVA sf mx v cv lm	# arguments are OR-ed in at bits 19, 17, 15, 13 and 10
+	cop2	0x0400012|(\sf<<19)|(\mx<<17)|(\v<<15)|(\cv<<13)|(\lm<<10)
+.endm
+	
+.macro SQR sf	# sf is OR-ed in at bit 19
+	cop2	0x0A00428|(\sf<<19)
+.endm
+	
+.macro OP sf lm	# sf at bit 19, lm at bit 10
+	cop2	0x170000C|(\sf<<19)|(\lm<<10)
+.endm
+	
+.macro NCS
+	cop2	0x0C8041E
+.endm
+	
+.macro NCT
+	cop2	0x0D80420
+.endm
+	
+.macro NCCS
+	cop2	0x108041B
+.endm
+	
+.macro NCCT
+	cop2	0x118043F
+.endm
+	
+.macro NCDS
+	cop2	0x0E80413
+.endm
+	
+.macro NCDT
+	cop2	0x0F80416
+.endm
+	
+.macro CC
+	cop2	0x138041C
+.endm
+	
+.macro CDP
+	cop2	0x1280414
+.endm
+	
+.macro DCPL
+	cop2	0x0680029
+.endm
+	
+.macro DPCS
+	cop2	0x0780010
+.endm
+	
+.macro DPCT	# DPCT is GTE function 0x2A; 0x0180001 is the RTPS word and must not be used here
+	cop2	0x0F8002A
+.endm
+
+.macro INTPL
+	cop2	0x0980011
+.endm
+
+.macro GPF sf	# sf is OR-ed in at bit 19
+	cop2	0x190003D|(\sf<<19)
+.endm
+
+.macro GPL sf	# sf is OR-ed in at bit 19
+	cop2	0x1A0003E|(\sf<<19)
+.endm
diff --git a/libmeidogte/isin.c b/libmeidogte/isin.c
new file mode 100644
index 0000000..3641efd
--- /dev/null
+++ b/libmeidogte/isin.c
@@ -0,0 +1,34 @@
+/* Based on isin_S4 implementation from coranac:
+ *	http://www.coranac.com/2009/07/sines/
+ *
+ */
+
+#define qN	10		/* 1<<qN input units are a quarter turn */
+#define qA	12		/* 1<<qA is the output value standing for 1.0 */
+#define B	19900		/* coefficients of the polynomial */
+#define	C	3516		/* A - x^2*(B - x^2*C) evaluated below */
+
+int isin(int x) {
+    /* Sine of x, where 4<<qN (4096) units make a full turn; the result is
+     * scaled so 1<<qA (4096) stands for 1.0. Left shifts are done on unsigned
+     * values: shifting bits out of a signed int is undefined behaviour in C. */
+    int c, y;
+
+    c= (int)((unsigned int)x<<(30-qN));  // Semi-circle info into carry.
+    x -= 1<<qN;                 // sine -> cosine calc
+
+    x= (int)((unsigned int)x<<(31-qN));  // Mask with PI
+    x= x>>(31-qN);              // Note: SIGNED shift! (to qN)
+
+    x= x*x>>(2*qN-14);          // x=x^2 To Q14
+
+    y= B - (x*C>>14);           // B - x^2*C
+    y= (1<<qA)-(x*y>>16);       // A - x^2*(B-x^2*C)
+    return c>=0 ? y : -y;       // sign bit of c marks the second half turn
+}
+
+int icos(int x) {
+    /* Cosine via the sine routine: shift the angle by 1024 units. */
+    int shifted = x + 1024;
+    return isin( shifted );
+}
diff --git a/libmeidogte/matrix.c b/libmeidogte/matrix.c
new file mode 100644
index 0000000..1c226e1
--- /dev/null
+++ b/libmeidogte/matrix.c
@@ -0,0 +1,45 @@
+#include <meidogte.h>
+
+MATRIX *RotMatrix(SVECTOR *r, MATRIX *m) {
+
+	/* Sine/cosine of each axis angle, narrowed to the matrix element type. */
+	const short sin_x = isin(r->vx), cos_x = icos(r->vx);
+	const short sin_y = isin(r->vy), cos_y = icos(r->vy);
+	const short sin_z = isin(r->vz), cos_z = icos(r->vz);
+
+	/* Only the 3x3 parts are filled in; the t[] members are never set here. */
+	MATRIX rot_y, rot_z, rot_xy;
+
+	/* X rotation is written straight into the caller's matrix. */
+	m->m[0][0] = ONE;		m->m[0][1] = 0;			m->m[0][2] = 0;
+	m->m[1][0] = 0;			m->m[1][1] = cos_x;		m->m[1][2] = -sin_x;
+	m->m[2][0] = 0;			m->m[2][1] = sin_x;		m->m[2][2] = cos_x;
+
+	/* Y rotation */
+	rot_y.m[0][0] = cos_y;	rot_y.m[0][1] = 0;		rot_y.m[0][2] = sin_y;
+	rot_y.m[1][0] = 0;		rot_y.m[1][1] = ONE;	rot_y.m[1][2] = 0;
+	rot_y.m[2][0] = -sin_y;	rot_y.m[2][1] = 0;		rot_y.m[2][2] = cos_y;
+
+	/* Z rotation */
+	rot_z.m[0][0] = cos_z;	rot_z.m[0][1] = -sin_z;	rot_z.m[0][2] = 0;
+	rot_z.m[1][0] = sin_z;	rot_z.m[1][1] = cos_z;	rot_z.m[1][2] = 0;
+	rot_z.m[2][0] = 0;		rot_z.m[2][1] = 0;		rot_z.m[2][2] = ONE;
+
+	/* Multiply X by Y, then that product by Z, between a PushMatrix/PopMatrix pair. */
+	PushMatrix();
+	MulMatrix0( m, &rot_y, &rot_xy );
+	MulMatrix0( &rot_xy, &rot_z, m );
+	PopMatrix();
+
+	return m;
+}
+
+MATRIX *TransMatrix(MATRIX *m, VECTOR *r) {
+
+	/* Copy the vector's three components into the matrix translation. */
+	int *dst = m->t;
+	dst[0] = r->vx;
+	dst[1] = r->vy;
+	dst[2] = r->vz;
+	return m;
+}
diff --git a/libmeidogte/meidogte.h b/libmeidogte/meidogte.h
new file mode 100644
index 0000000..3953701
--- /dev/null
+++ b/libmeidogte/meidogte.h
@@ -0,0 +1,170 @@
+#ifndef _MEIDOGTE_H
+#define _MEIDOGTE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <meidogte_inline.h>
+
+/**
+ * Fixed-point unity: RotMatrix() stores ONE where a rotation matrix holds 1.0.
+ */
+#define ONE		4096
+
+
+typedef struct {
+	short	m[3][3];	/* 3x3 part; RotMatrix() writes ONE here for 1.0 */
+	int		t[3];		/* translation; written by TransMatrix() */
+} MATRIX;
+
+typedef struct {
+	int		vx, vy, vz;	/* 32-bit components */
+} VECTOR;
+
+typedef struct {
+	short	vx, vy, vz, pad;	/* 16-bit components; pad brings the struct to 8 bytes */
+} SVECTOR;
+
+typedef struct {
+	unsigned char r, g, b, cd;	/* NOTE(review): meaning of cd is not shown here - confirm */
+} CVECTOR;
+
+/**
+ * Initialize MeidoGTE library (enables the GTE and loads default settings)
+ */
+
+void InitGeom(void);
+
+/**
+ * Integer sine function (4096 = 360 degrees)
+ * @param a Input
+ * @return Sine of input, scaled so that ONE (4096) represents 1.0
+ */
+int isin(int a);
+
+/**
+ * Integer cosine function (4096 = 360 degrees)
+ * @param a Input
+ * @return Cosine of input, scaled so that ONE (4096) represents 1.0
+ */
+int icos(int a);
+
+/**
+ * Higher precision integer sine function (131072 = 360 degrees)
+ * @param a Input
+ * @return Sine of input, scaled so that ONE (4096) represents 1.0
+ */
+int hisin(int a);
+/**
+ * Higher precision integer cosine function (131072 = 360 degrees)
+ * @param a Input
+ * @return Cosine of input, scaled so that ONE (4096) represents 1.0
+ */
+int hicos(int a);
+
+/**
+ * Save the GTE rotation matrix and translation in one fixed slot (a second push overwrites it).
+ */
+void PushMatrix(void);
+
+/**
+ * Restore the GTE rotation matrix and translation saved by the last PushMatrix().
+ */
+void PopMatrix(void);
+
+/**
+ * Find rotation matrix from a rotation angle. (4096 = 360 degrees)
+ * @param r Rotation angle (input)
+ * @param m Rotation matrix (output)
+ * @return Pointer to m
+ */
+ 
+MATRIX *RotMatrix(SVECTOR *r, MATRIX *m);
+
+/**
+ * Find rotation matrix from a rotation angle. (high-precision) (131072 = 360 degrees)
+ * @param r Rotation angle (input)
+ * @param m Rotation matrix (output)
+ * @return Pointer to m
+ */
+MATRIX *HiRotMatrix(VECTOR *r, MATRIX *m);
+
+/**
+ * Give an amount of parallel transfer expressed by r to the matrix m.
+ * @param m Pointer to matrix (output)
+ * @param r Pointer to transfer vector (input)
+ * @return Pointer to m
+ */
+MATRIX *TransMatrix(MATRIX *m, VECTOR *r);
+/**
+ * Scale m by s.
+ * @param m Pointer to matrix (output)
+ * @param s Pointer to scale vector (input)
+ * @return Pointer to m
+ */
+MATRIX *ScaleMatrix(MATRIX *m, VECTOR *s);
+
+/**
+ * Multiply two matrices.
+ * @param m0 First matrix (result is saved here)
+ * @param m1 Second matrix
+ * @return Pointer to m0.
+ */
+MATRIX *MulMatrix(MATRIX *m0, MATRIX *m1);
+/**
+ * Multiply two matrices.
+ * @param m0 First matrix
+ * @param m1 Second matrix
+ * @param m2 Output matrix
+ * @return Pointer to m2
+ */
+MATRIX *MulMatrix0(MATRIX *m0, MATRIX *m1, MATRIX *m2);
+/**
+ * Make a composite coordinate transformation matrix.
+ * @param v0 First matrix
+ * @param v1 Second matrix
+ * @param v2 Output matrix
+ * @return Pointer to v2
+ */
+MATRIX *CompMatrixLV(MATRIX *v0, MATRIX *v1, MATRIX *v2);
+/**
+ * Multiply a vector by a matrix.
+ * @param m Pointer to matrix to be multiplied
+ * @param v0 Pointer to vector (input)
+ * @param v1 Pointer to vector (output)
+ * @return Pointer to v1
+ */
+VECTOR *ApplyMatrixLV(MATRIX *m, VECTOR *v0, VECTOR *v1);
+/**
+ * Normalize a vector.
+ * Warning: if ((v0->vx)^2 + (v0->vy)^2 + (v0->vz)^2) > 0x7FFFFFF,
+ * a processor exception will occur.
+ * @param v0 Pointer to vector (input)
+ * @param v1 Pointer to vector (output)
+ */
+void VectorNormalS(VECTOR *v0, SVECTOR *v1);
+/**
+ * Return a vector, obtained by squaring each term of the vector v0, to v1.
+ * @param v0 Pointer to vector (input)
+ * @param v1 Pointer to vector (output)
+ */
+void Square0(VECTOR *v0, VECTOR *v1);
+/**
+ * Square root
+ * @param a Input value
+ * @return Square root of input value
+ */
+int SquareRoot0(int a);
+/**
+ * Square root
+ * @param a Input value in (0, 20, 12) format
+ * @return Square root of input value in (0, 20, 12) format
+ */
+int SquareRoot12(int a);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _MEIDOGTE_H
diff --git a/libmeidogte/meidogte_inline.h b/libmeidogte/meidogte_inline.h
new file mode 100644
index 0000000..ab03702
--- /dev/null
+++ b/libmeidogte/meidogte_inline.h
@@ -0,0 +1,433 @@
+/* Inline GTE macros for the GNU C compiler.
+ *
+ * 2019 Meido-Tek Productions
+ *
+ *
+ *
+ * Todo: A couple of GTE operation macros are still missing such as
+ *  gte_rtv*() though they appear to be just variants of gte_mvmva more or
+ *  less (gte_rtv0() is actually gte_mvmva(1, 0, 0, 3, 0) for example).
+ *
+ */
+
+#ifndef _MEIDOGTE_INLINE_C_H
+#define _MEIDOGTE_INLINE_C_H
+
+/**
+ *	GTE load macros
+ */
+
+/**
+ * Load a SVECTOR (passed as a pointer) to GTE V0
+ */
+#define gte_ldv0( r0 ) __asm__ volatile ( \
+	"lwc2	$0 , 0( %0 );"	\
+	"lwc2	$1 , 4( %0 );"	\
+	:						\
+	: "r"( r0 )				\
+	: "memory" )
+
+/**
+ * Load a SVECTOR (passed as a pointer) to GTE V1
+ */
+#define gte_ldv1( r0 ) __asm__ volatile ( \
+	"lwc2	$2 , 0( %0 );"	\
+	"lwc2	$3 , 4( %0 );"	\
+	:						\
+	: "r"( r0 )				\
+	: "memory" )
+
+/**
+ * Load a SVECTOR (passed as a pointer) to GTE V2
+ */
+#define gte_ldv2( r0 ) __asm__ volatile ( \
+	"lwc2	$4 , 0( %0 );"	\
+	"lwc2	$5 , 4( %0 );"	\
+	:						\
+	: "r"( r0 )				\
+	: "memory" )
+
+/**
+ * Load three SVECTORs (passed as a pointer) to the GTE at once
+ */
+#define gte_ldv3( r0, r1, r2 ) __asm__ volatile ( \
+	"lwc2	$0 , 0( %0 );"	\
+	"lwc2	$1 , 4( %0 );"	\
+	"lwc2	$2 , 0( %1 );"	\
+	"lwc2	$3 , 4( %1 );"	\
+	"lwc2	$4 , 0( %2 );"	\
+	"lwc2	$5 , 4( %2 );"	\
+	:						\
+	: "r"( r0 ), "r"( r1 ), "r"( r2 ) : "memory" )
+/* Load the word at r0 into GTE data register 6 (RGB). */
+#define gte_ldrgb( r0 ) __asm__ volatile ( \
+	"lwc2	$6 , 0( %0 );"	\
+	:						\
+	: "r"( r0 ) : "memory" )
+/* Load the three words at r0 into GTE data registers 9-11 (IR1-IR3). */
+#define gte_ldopv2( r0 ) __asm__ volatile ( \
+	"lwc2	$11, 8( %0 );"	\
+	"lwc2	$9 , 0( %0 );"	\
+	"lwc2	$10, 4( %0 );"	\
+	:						\
+	: "r"( r0 ) : "memory" )
+	
+/**
+ * Sets the GTE offset (each argument is shifted left by 16 before being written)
+ */
+#define gte_SetGeomOffset( r0, r1 ) __asm__ volatile ( \
+	"sll	$t0, %0, 16;"	\
+	"sll	$t1, %1, 16;"	\
+	"ctc2 	$t0, $24;"		\
+	"ctc2	$t1, $25;"		\
+	:						\
+	: "r"( r0 ), "r"( r1 )	\
+	: "$t0", "$t1" )
+/* Write r0 to GTE control register 26 (H, the projection plane setting). */
+#define gte_SetGeomScreen( r0 ) __asm__ volatile ( \
+	"ctc2	%0, $26;"		\
+	:						\
+	: "r"( r0 ) )
+/* Load t[0..2] of the MATRIX at r0 into control registers 5-7 (TRX/TRY/TRZ). */
+#define gte_SetTransMatrix( r0 ) __asm__ volatile ( \
+	"lw		$t0, 20( %0 );"	\
+	"lw		$t1, 24( %0 );"	\
+	"ctc2	$t0, $5;"		\
+	"lw		$t2, 28( %0 );"	\
+	"ctc2	$t1, $6;"		\
+	"ctc2	$t2, $7;"		\
+	:						\
+	: "r"( r0 )				\
+	: "$t0", "$t1", "$t2", "memory" )
+/* Load the 3x3 part of the MATRIX at r0 into control registers 0-4 (R11..R33). */
+#define gte_SetRotMatrix( r0 ) __asm__ volatile ( \
+	"lw		$t0, 0( %0 );"	\
+	"lw		$t1, 4( %0 );"	\
+	"ctc2	$t0, $0;"		\
+	"ctc2	$t1, $1;"		\
+	"lw		$t0, 8( %0 );"	\
+	"lw		$t1, 12( %0 );"	\
+	"lhu	$t2, 16( %0 );"	\
+	"ctc2	$t0, $2;"		\
+	"ctc2	$t1, $3;"		\
+	"ctc2	$t2, $4;"		\
+	:						\
+	: "r"( r0 )				\
+	: "$t0", "$t1", "$t2", "memory" )
+/* Load the 3x3 part of the MATRIX at r0 into control registers 8-12 (L11..L33). */
+#define gte_SetLightMatrix( r0 ) __asm__ volatile ( \
+	"lw		$t0, 0( %0 );"	\
+	"lw		$t1, 4( %0 );"	\
+	"ctc2	$t0, $8;"		\
+	"ctc2	$t1, $9;"		\
+	"lw		$t0, 8( %0 );"	\
+	"lw		$t1, 12( %0 );"	\
+	"lhu	$t2, 16( %0 );"	\
+	"ctc2	$t0, $10;"		\
+	"ctc2	$t1, $11;"		\
+	"ctc2	$t2, $12;"		\
+	:						\
+	: "r"( r0 )				\
+	: "$t0", "$t1", "$t2", "memory" )
+/* Load the 3x3 part of the MATRIX at r0 into control registers 16-20 (LR1..LB3). */
+#define gte_SetColorMatrix( r0 ) __asm__ volatile ( \
+	"lw		$t0, 0( %0 );"	\
+	"lw		$t1, 4( %0 );"	\
+	"ctc2	$t0, $16;"		\
+	"ctc2	$t1, $17;"		\
+	"lw		$t0, 8( %0 );"	\
+	"lw		$t1, 12( %0 );"	\
+	"lhu	$t2, 16( %0 );"	\
+	"ctc2	$t0, $18;"		\
+	"ctc2	$t1, $19;"		\
+	"ctc2	$t2, $20;"		\
+	:						\
+	: "r"( r0 )				\
+	: "$t0", "$t1", "$t2", "memory" )
+/* Write r0, r1, r2, each shifted left by 4, to control registers 13-15 (RBK/GBK/BBK). */
+#define gte_SetBackColor( r0, r1, r2 ) __asm__ volatile ( \
+	"sll	$t0, %0, 4;"	\
+	"sll	$t1, %1, 4;"	\
+	"sll	$t2, %2, 4;"	\
+	"ctc2	$t0, $13;"		\
+	"ctc2	$t1, $14;"		\
+	"ctc2	$t2, $15;"		\
+	:						\
+	: "r"( r0 ), "r"( r1 ), "r"( r2 )	\
+	: "$t0", "$t1", "$t2" )
+	
+/**
+ *	GTE store macros
+ */
+/* Store GTE data register 7 (OTZ) to the word at r0. */
+#define gte_otz( r0 ) __asm__ volatile ( \
+	"swc2	$7, 0( %0 );"	\
+	:						\
+	: "r"( r0 )				\
+	: "memory" )
+/* Store GTE control register 31 (FLAG) to the word at r0; uses $t0 as scratch. */
+#define gte_stflg( r0 ) __asm__ volatile ( \
+	"cfc2	$t0, $31;"		\
+	"nop;"					\
+	"sw		$t0, 0( %0 );"	\
+	:						\
+	: "r"( r0 )				\
+	: "$t0", "memory" )
+/* Store data register 14 (SXY2) to the word at r0. */
+#define gte_stsxy( r0 ) __asm__ volatile ( \
+	"swc2	$14, 0( %0 );"	\
+	:						\
+	: "r"( r0 )				\
+	: "memory" )
+/* Store data register 12 (SXY0) to the word at r0. */
+#define gte_stsxy0( r0 ) __asm__ volatile ( \
+	"swc2	$12, 0( %0 );"	\
+	:						\
+	: "r"( r0 )				\
+	: "memory" )
+/* Store data register 13 (SXY1) to the word at r0. */
+#define gte_stsxy1( r0 ) __asm__ volatile ( \
+	"swc2	$13, 0( %0 );"	\
+	:						\
+	: "r"( r0 )				\
+	: "memory" )
+/* Store data register 14 (SXY2) to the word at r0. */
+#define gte_stsxy2( r0 ) __asm__ volatile ( \
+	"swc2	$14, 0( %0 );"	\
+	:						\
+	: "r"( r0 )				\
+	: "memory" )
+/* Store SXY0, SXY1 and SXY2 to the words at r0, r1 and r2. */
+#define gte_stsxy3( r0, r1, r2 ) __asm__ volatile ( \
+	"swc2	$12, 0( %0 );"	\
+	"swc2	$13, 0( %1 );"	\
+	"swc2	$14, 0( %2 );"	\
+	:						\
+	: "r"( r0 ), "r"( r1 ), "r"( r2 ) \
+	: "memory" )
+/* Same store as gte_otz(): data register 7 (OTZ) to the word at r0. */
+#define gte_stotz( r0 ) __asm__ volatile ( \
+	"swc2	$7, 0( %0 );"	\
+	:						\
+	: "r"( r0 ) 			\
+	: "memory" )
+/* Store data register 24 (MAC0) to the word at r0. */
+#define gte_stopz( r0 ) __asm__ volatile ( \
+	"swc2	$24, 0( %0 );"	\
+	:						\
+	: "r"( r0 )				\
+	: "memory" )
+/* Store data register 22 (RGB2) to the word at r0. */
+#define gte_strgb( r0 ) __asm__ volatile ( \
+	"swc2	$22, 0( %0 );"	\
+	:						\
+	: "r"( r0 )				\
+	: "memory" )
+/* Store RGB0, RGB1 and RGB2 to the words at r0, r1 and r2. */
+#define gte_strgb3( r0, r1, r2 ) __asm__ volatile ( \
+	"swc2	$20, 0( %0 );"	\
+	"swc2	$21, 0( %1 );"	\
+	"swc2	$22, 0( %2 );"	\
+	:						\
+	: "r"( r0 ), "r"( r1 ), "r" ( r2 )	\
+	: "memory" )
+/* Store IR1-IR3 as three halfwords at r0; uses $t0-$t2 as scratch. */
+#define gte_stsv( r0 ) __asm__ volatile ( \
+	"mfc2	$t0, $9;"		\
+	"mfc2	$t1, $10;"		\
+	"mfc2	$t2, $11;"		\
+	"sh		$t0, 0( %0 );"	\
+	"sh		$t1, 2( %0 );"	\
+	"sh		$t2, 4( %0 );"	\
+	:						\
+	: "r"( r0 )				\
+	: "$t0", "$t1", "$t2", "memory" )
+/* Store data registers 25-27 (MAC1-MAC3) to three words at r0. */
+#define gte_stlvnl( r0 ) __asm__ volatile ( \
+	"swc2	$25, 0( %0 );"	\
+	"swc2	$26, 4( %0 );"	\
+	"swc2	$27, 8( %0 );"	\
+	:						\
+	: "r"( r0 )				\
+	: "memory" )
+	
+	
+/**
+ *	GTE operation macros (each issues two nops, then one cop2 command word)
+ */
+
+#define gte_rtps() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2 0x0180001;" )
+
+#define gte_rtpt() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2 0x0280030;" )
+
+#define gte_nclip() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2 0x1400006;" )
+
+#define gte_avsz3() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2 0x158002D;" )
+
+#define gte_avsz4() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2 0x168002E;" )
+
+#define gte_sqr0() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2	0x0A00428;" )
+
+#define gte_sqr12() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2	0x0A80428;" )
+
+#define gte_op0() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2	0x170000C;" )
+
+#define gte_op12() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2	0x178000C;" )
+
+#define gte_ncs() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2	0x0C8041E;" )
+
+#define gte_nct() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2	0x0D80420;" )
+
+#define gte_nccs() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2	0x108041B;" )
+
+#define gte_ncct() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2	0x118043F;" )
+
+#define gte_ncds() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2	0x0E80413;" )
+
+#define gte_ncdt() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2	0x0F80416;" )
+
+#define gte_cc() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2	0x138041C;" )
+
+#define gte_cdp() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2	0x1280414;" )
+
+#define gte_dcpl() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2	0x0680029;" )
+
+#define gte_dpcs() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2	0x0780010;" )
+/* DPCT is GTE function 0x2A; the RTPS word 0x0180001 must not be used here. */
+#define gte_dpct() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2	0x0F8002A;" )
+
+#define gte_intpl() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2	0x0980011;" )
+
+#define gte_gpf0() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2	0x190003D;" )
+
+#define gte_gpf12() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2	0x198003D;" )
+
+#define gte_gpl0() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2	0x1A0003E;" )
+
+#define gte_gpl12() __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2 0x1A8003E;" )
+/* cop2 takes its command word as an immediate, so r0 must be a compile-time constant ("i"). */
+#define gte_mvmva_core( r0 ) __asm__ volatile ( \
+	"nop;"					\
+	"nop;"					\
+	"cop2 %0"				\
+	:						\
+	: "i"( r0 ) )
+/* Builds the MVMVA command word: sf, mx, v, cv, lm go to bits 19, 17, 15, 13, 10. */
+#define gte_mvmva(sf, mx, v, cv, lm) gte_mvmva_core( 0x0400012 | \
+	((sf)<<19) | ((mx)<<17) | ((v)<<15) | ((cv)<<13) | ((lm)<<10) )
+	
+	
+/**
+ *	GTE operation macros without leading nops
+ *
+ *	Checking assembler output when using these is advised.
+ */
+
+#define gte_rtps_b()	__asm__ volatile ( "cop2 0x0180001;" )
+#define gte_rtpt_b()	__asm__ volatile ( "cop2 0x0280030;" )
+#define gte_nclip_b()	__asm__ volatile ( "cop2 0x1400006;" )
+#define gte_avsz3_b()	__asm__ volatile ( "cop2 0x158002D;" )
+#define gte_avsz4_b()	__asm__ volatile ( "cop2 0x168002E;" )
+#define gte_sqr0_b()	__asm__ volatile ( "cop2 0x0A00428;" )
+#define gte_sqr12_b()	__asm__ volatile ( "cop2 0x0A80428;" )
+#define gte_op0_b()		__asm__ volatile ( "cop2 0x170000C;" )
+#define gte_op12_b()	__asm__ volatile ( "cop2 0x178000C;" )
+#define gte_ncs_b()		__asm__ volatile ( "cop2 0x0C8041E;" )
+#define gte_nct_b()		__asm__ volatile ( "cop2 0x0D80420;" )
+#define gte_nccs_b()	__asm__ volatile ( "cop2 0x108041B;" )
+#define gte_ncct_b()	__asm__ volatile ( "cop2 0x118043F;" )
+#define gte_ncds_b()	__asm__ volatile ( "cop2 0x0E80413;" )
+#define gte_ncdt_b()	__asm__ volatile ( "cop2 0x0F80416;" )
+#define gte_cc_b()		__asm__ volatile ( "cop2 0x138041C;" )
+#define gte_cdp_b()		__asm__ volatile ( "cop2 0x1280414;" )
+#define gte_dcpl_b()	__asm__ volatile ( "cop2 0x0680029;" )
+#define gte_dpcs_b()	__asm__ volatile ( "cop2 0x0780010;" )
+#define gte_dpct_b()	__asm__ volatile ( "cop2 0x0F8002A;" )	/* DPCT = function 0x2A, not the RTPS word */
+#define gte_intpl_b()	__asm__ volatile ( "cop2 0x0980011;" )
+#define gte_gpf0_b()	__asm__ volatile ( "cop2 0x190003D;" )
+#define gte_gpf12_b()	__asm__ volatile ( "cop2 0x198003D;" )
+#define gte_gpl0_b()	__asm__ volatile ( "cop2 0x1A0003E;" )
+#define gte_gpl12_b()	__asm__ volatile ( "cop2 0x1A8003E;" )
+#define gte_mvmva_core_b( r0 ) __asm__ volatile ( /* r0 must be a constant: cop2 takes an immediate */ \
+	"cop2 %0"				\
+	:						\
+	: "i"( r0 ) )
+#define gte_mvmva_b(sf, mx, v, cv, lm) gte_mvmva_core_b( 0x0400012 | \
+	((sf)<<19) | ((mx)<<17) | ((v)<<15) | ((cv)<<13) | ((lm)<<10) )
+	
+#endif // _MEIDOGTE_INLINE_C_H
+\ No newline at end of file
diff --git a/libmeidogte/mulmatrix.s b/libmeidogte/mulmatrix.s
new file mode 100644
index 0000000..19dabe8
--- /dev/null
+++ b/libmeidogte/mulmatrix.s
@@ -0,0 +1,74 @@
+.set noreorder
+
+.include "gtereg.h"
+.include "inline_s.h"
+
+.section .text
+
+
+.global MulMatrix
+.type MulMatrix, @function
+MulMatrix:		# MATRIX *MulMatrix(MATRIX *m0, MATRIX *m1)
+	# a0 = m0 (read one column at a time, overwritten with the result), a1 = m1
+	# Load m1 to GTE
+	lw		$t0, 0($a1)
+	lw		$t1, 4($a1)
+	ctc2	$t0, $0
+	ctc2	$t1, $1
+	lw		$t0, 8($a1)
+	lw		$t1, 12($a1)
+	lhu		$t2, 16($a1)
+	ctc2	$t0, $2
+	ctc2	$t1, $3
+	ctc2	$t2, $4
+	# NOTE(review): m1 is the GTE matrix here, MulMatrix0 loads its first argument - confirm order
+	lhu		$t1, 2*(0+(3*1))($a0)		# Load values for first
+	lhu		$t0, 2*(0+(3*0))($a0)		# R11 R21 R31
+	sll		$t1, 16
+	or		$t0, $t1
+	lhu		$t1, 2*(0+(3*2))($a0)
+	mtc2	$t0, C2_VXY0
+	mtc2	$t1, C2_VZ0
+
+	lhu		$t1, 2*(1+(3*1))($a0)		# Load values for second
+	lhu		$t0, 2*(1+(3*0))($a0)		# R12 R22 R32
+	MVMVA(1, 0, 0, 3, 0)				# First multiply
+	sll		$t1, 16
+	or		$t0, $t1
+	lhu		$t1, 2*(1+(3*2))($a0)
+	mtc2	$t0, C2_VXY0
+	mtc2	$t1, C2_VZ0
+
+	mfc2	$t0, C2_IR1					# Store results of first
+	mfc2	$t1, C2_IR2
+	sh		$t0, 2*(0+(3*0))($a0)
+	mfc2	$t0, C2_IR3
+	sh		$t1, 2*(0+(3*1))($a0)
+	sh		$t0, 2*(0+(3*2))($a0)
+
+	lhu		$t1, 2*(2+(3*1))($a0)		# Load values for third
+	lhu		$t0, 2*(2+(3*0))($a0)		# R13 R23 R33
+	MVMVA(1, 0, 0, 3, 0)				# Second multiply
+	sll		$t1, 16
+	or		$t0, $t1
+	lhu		$t1, 2*(2+(3*2))($a0)
+	mtc2	$t0, C2_VXY0
+	mtc2	$t1, C2_VZ0
+
+	mfc2	$t0, C2_IR1					# Store results of second
+	mfc2	$t1, C2_IR2
+	sh		$t0, 2*(1+(3*0))($a0)
+	mfc2	$t0, C2_IR3
+	sh		$t1, 2*(1+(3*1))($a0)
+	sh		$t0, 2*(1+(3*2))($a0)
+	MVMVA(1, 0, 0, 3, 0)				# Third multiply
+
+	mfc2	$t0, C2_IR1					# Store results of third
+	mfc2	$t1, C2_IR2
+	sh		$t0, 2*(2+(3*0))($a0)
+	mfc2	$t0, C2_IR3
+	sh		$t1, 2*(2+(3*1))($a0)
+	sh		$t0, 2*(2+(3*2))($a0)
+
+	jr		$ra
+	move	$v0, $a0					# branch delay slot: return m0
diff --git a/libmeidogte/mulmatrix0.s b/libmeidogte/mulmatrix0.s
new file mode 100644
index 0000000..874226b
--- /dev/null
+++ b/libmeidogte/mulmatrix0.s
@@ -0,0 +1,74 @@
+.set noreorder
+
+.include "gtereg.h"
+.include "inline_s.h"
+
+.section .text
+
+
+.global MulMatrix0
+.type MulMatrix0, @function
+MulMatrix0:		# MATRIX *MulMatrix0(MATRIX *m0, MATRIX *m1, MATRIX *m2)
+	# a0 = m0, a1 = m1 (read one column at a time), a2 = m2 (output)
+	# Load m0 (a0) to GTE
+	lw		$t0, 0($a0)
+	lw		$t1, 4($a0)
+	ctc2	$t0, $0
+	ctc2	$t1, $1
+	lw		$t0, 8($a0)
+	lw		$t1, 12($a0)
+	lhu		$t2, 16($a0)
+	ctc2	$t0, $2
+	ctc2	$t1, $3
+	ctc2	$t2, $4
+
+	lhu		$t1, 2*(0+(3*1))($a1)		# Load values for first
+	lhu		$t0, 2*(0+(3*0))($a1)		# R11 R21 R31
+	sll		$t1, 16
+	or		$t0, $t1
+	lhu		$t1, 2*(0+(3*2))($a1)
+	mtc2	$t0, C2_VXY0
+	mtc2	$t1, C2_VZ0
+
+	lhu		$t1, 2*(1+(3*1))($a1)		# Load values for second
+	lhu		$t0, 2*(1+(3*0))($a1)		# R12 R22 R32
+	MVMVA(1, 0, 0, 3, 0)				# First multiply
+	sll		$t1, 16
+	or		$t0, $t1
+	lhu		$t1, 2*(1+(3*2))($a1)
+	mtc2	$t0, C2_VXY0
+	mtc2	$t1, C2_VZ0
+
+	mfc2	$t0, C2_IR1					# Store results of first
+	mfc2	$t1, C2_IR2
+	sh		$t0, 2*(0+(3*0))($a2)
+	mfc2	$t0, C2_IR3
+	sh		$t1, 2*(0+(3*1))($a2)
+	sh		$t0, 2*(0+(3*2))($a2)
+
+	lhu		$t1, 2*(2+(3*1))($a1)		# Load values for third
+	lhu		$t0, 2*(2+(3*0))($a1)		# R13 R23 R33
+	MVMVA(1, 0, 0, 3, 0)				# Second multiply
+	sll		$t1, 16
+	or		$t0, $t1
+	lhu		$t1, 2*(2+(3*2))($a1)
+	mtc2	$t0, C2_VXY0
+	mtc2	$t1, C2_VZ0
+
+	mfc2	$t0, C2_IR1					# Store results of second
+	mfc2	$t1, C2_IR2
+	sh		$t0, 2*(1+(3*0))($a2)
+	mfc2	$t0, C2_IR3
+	sh		$t1, 2*(1+(3*1))($a2)
+	sh		$t0, 2*(1+(3*2))($a2)
+	MVMVA(1, 0, 0, 3, 0)				# Third multiply
+
+	mfc2	$t0, C2_IR1					# Store results of third
+	mfc2	$t1, C2_IR2
+	sh		$t0, 2*(2+(3*0))($a2)
+	mfc2	$t0, C2_IR3
+	sh		$t1, 2*(2+(3*1))($a2)
+	sh		$t0, 2*(2+(3*2))($a2)
+
+	jr		$ra
+	move	$v0, $a2					# branch delay slot: return m2
diff --git a/libmeidogte/pushpopmatrix.s b/libmeidogte/pushpopmatrix.s
new file mode 100644
index 0000000..d10687a
--- /dev/null
+++ b/libmeidogte/pushpopmatrix.s
@@ -0,0 +1,68 @@
+.set noreorder
+
+.include "gtereg.h"
+.include "inline_s.h"
+
+.section .text
+
+
+.global PushMatrix
+.type PushMatrix, @function
+PushMatrix:		# copy GTE control regs R11R12..R33 and TRX..TRZ to _matrix_stack
+	la		$a0, _matrix_stack		# one fixed 8-word save area, no depth counter
+	cfc2	$v0, C2_R11R12
+	cfc2	$v1, C2_R13R21
+	sw		$v0, 0($a0)
+	cfc2	$v0, C2_R22R23
+	sw		$v1, 4($a0)
+	sw		$v0, 8($a0)
+	cfc2	$v0, C2_R31R32
+	cfc2	$v1, C2_R33
+	sw		$v0, 12($a0)
+	sw		$v1, 16($a0)
+	cfc2	$v0, C2_TRX
+	cfc2	$v1, C2_TRY
+	sw		$v0, 20($a0)
+	cfc2	$v0, C2_TRZ
+	sw		$v1, 24($a0)
+	jr		$ra
+	sw		$v0, 28($a0)			# branch delay slot: last store
+
+.global PopMatrix
+.type PopMatrix, @function
+PopMatrix:		# reload R11R12..R33 and TRX..TRZ from _matrix_stack
+	la		$a0, _matrix_stack		# same save area that PushMatrix writes
+	lw		$v0, 0($a0)
+	lw		$v1, 4($a0)
+	ctc2	$v0, C2_R11R12
+	ctc2	$v1, C2_R13R21
+	lw		$v0, 8($a0)
+	lw		$v1, 12($a0)
+	ctc2	$v0, C2_R22R23
+	lw		$v0, 16($a0)
+	ctc2	$v1, C2_R31R32
+	ctc2	$v0, C2_R33
+	lw		$v0, 20($a0)
+	lw		$v1, 24($a0)
+	ctc2	$v0, C2_TRX
+	lw		$v0, 28($a0)
+	ctc2	$v1, C2_TRY
+	ctc2	$v0, C2_TRZ
+	jr		$ra
+	nop								# branch delay slot
+
+
+.section .data
+
+
+.type _matrix_stack, @object		# symbol name must match the label below
+_matrix_stack:						# 8 words: five matrix words, then TRX/TRY/TRZ
+	.word 0
+	.word 0
+	.word 0
+	.word 0
+	.word 0
+	.word 0
+	.word 0
+	.word 0
+
diff --git a/libmeidogte/scalematrix.s b/libmeidogte/scalematrix.s
new file mode 100644
index 0000000..3e83800
--- /dev/null
+++ b/libmeidogte/scalematrix.s
@@ -0,0 +1,68 @@
+.set noreorder
+
+.include "gtereg.h"
+.include "inline_s.h"
+
+.section .text
+
+
+.global ScaleMatrix
+.type ScaleMatrix, @function
+ScaleMatrix:	# $a0 = matrix of 16-bit elements (modified in place), $a1 = three 32-bit scale words; returns $a0 in $v0
+	# Three identical passes: load IR0 from the scale vector, load three matrix halfwords, run nGPF(1), store IR1..IR3 back.
+	lwc2	C2_IR0,	0($a1)			# X: IR0 <- word at $a1+0
+
+	lh		$v0, 2*(0+(3*0))($a0)	# halfword index 0+3*r for r = 0,1,2 (m[r][0] if the 3x3 is row-major)
+	lh		$v1, 2*(0+(3*1))($a0)
+	mtc2	$v0, C2_IR1
+	lh		$v0, 2*(0+(3*2))($a0)
+	mtc2	$v1, C2_IR2
+	mtc2	$v0, C2_IR3
+
+	nGPF(1)	# macro from inline_s.h (not visible here); presumably GTE GPF: IR1..3 = (IR0 * IR1..3) >> 12 -- confirm
+
+	mfc2	$v0, C2_IR1
+	mfc2	$v1, C2_IR2
+	sh		$v0, 2*(0+(3*0))($a0)	# results go back to the same three slots they were read from
+	mfc2	$v0, C2_IR3
+	sh		$v1, 2*(0+(3*1))($a0)
+	sh		$v0, 2*(0+(3*2))($a0)
+
+	lwc2	C2_IR0,	4($a1)			# Y: IR0 <- word at $a1+4
+
+	lh		$v0, 2*(1+(3*0))($a0)	# halfword index 1+3*r for r = 0,1,2
+	lh		$v1, 2*(1+(3*1))($a0)
+	mtc2	$v0, C2_IR1
+	lh		$v0, 2*(1+(3*2))($a0)
+	mtc2	$v1, C2_IR2
+	mtc2	$v0, C2_IR3
+
+	nGPF(1)
+
+	mfc2	$v0, C2_IR1
+	mfc2	$v1, C2_IR2
+	sh		$v0, 2*(1+(3*0))($a0)
+	mfc2	$v0, C2_IR3
+	sh		$v1, 2*(1+(3*1))($a0)
+	sh		$v0, 2*(1+(3*2))($a0)
+
+	lwc2	C2_IR0,	8($a1)			# Z: IR0 <- word at $a1+8
+
+	lh		$v0, 2*(2+(3*0))($a0)	# halfword index 2+3*r for r = 0,1,2
+	lh		$v1, 2*(2+(3*1))($a0)
+	mtc2	$v0, C2_IR1
+	lh		$v0, 2*(2+(3*2))($a0)
+	mtc2	$v1, C2_IR2
+	mtc2	$v0, C2_IR3
+
+	nGPF(1)
+
+	mfc2	$v0, C2_IR1
+	mfc2	$v1, C2_IR2
+	sh		$v0, 2*(2+(3*0))($a0)
+	mfc2	$v0, C2_IR3
+	sh		$v1, 2*(2+(3*1))($a0)
+	sh		$v0, 2*(2+(3*2))($a0)
+
+	jr		$ra
+	move	$v0, $a0	# (branch delay slot) return the matrix pointer that was passed in
diff --git a/libmeidogte/square0.s b/libmeidogte/square0.s
new file mode 100644
index 0000000..d037b7e
--- /dev/null
+++ b/libmeidogte/square0.s
@@ -0,0 +1,27 @@
+.set noreorder
+
+.include "gtereg.h"
+.include "inline_s.h"
+
+.section .text
+
+
+.global Square0
+.type Square0, @function
+Square0:
+	# Loads three 32-bit words into IR1..IR3, runs nSQR(0), stores IR1..IR3 out.
+	# a0 - Pointer to input vector (v0)
+	# a1 - Pointer to output vector (v1)
+	# No return value is set; only GTE registers and the output words are written.
+	lwc2	C2_IR1, 0($a0)
+	lwc2	C2_IR2, 4($a0)
+	lwc2	C2_IR3, 8($a0)
+
+	nSQR(0)	# macro from inline_s.h (not visible here); presumably GTE SQR with sf=0, i.e. each IRn squared with no shift -- confirm
+
+	swc2	C2_IR1, 0($a1)	# results are taken from IR1..IR3 and written as 32-bit words
+	swc2	C2_IR2, 4($a1)
+	swc2	C2_IR3, 8($a1)
+
+	jr		$ra
+	nop	# (branch delay slot)
diff --git a/libmeidogte/squareroot.s b/libmeidogte/squareroot.s
new file mode 100644
index 0000000..af095a2
--- /dev/null
+++ b/libmeidogte/squareroot.s
@@ -0,0 +1,121 @@
+.set noreorder
+
+.include "gtereg.h"
+.include "inline_s.h"
+
+.section .text
+
+.global SquareRoot12
+.type SquareRoot12, @function
+SquareRoot12:	# $a0 = input value; returns the table-based square root in $v0 (0 when LZCR reads 32)
+	mtc2	$a0, C2_LZCS	# let the GTE count the leading bits of the input
+	nop
+	nop
+	mfc2	$v0, C2_LZCR	# $v0 = leading-bit count
+	beq		$v0, 32, $bad_sqr12	# assembler macro (li $at,32 + beq); count 32 means no significant bits
+	nop
+	andi	$t0, $v0, 0x1	# $t0 is never read afterwards in this routine
+	addiu	$v1, $0 , -2
+	and		$t2, $v0, $v1	# $t2 = count rounded down to an even number
+	li		$t1, 19
+	sub		$t1, $t2
+	sra		$t1, 1	# $t1 = (19 - even count) / 2: final shift applied to the table value
+	addi	$t3, $t2, -24
+	bltz	$t3, $value_less12	# choose shift direction that brings the input into the table's index range
+	nop
+	b		$value_greater12
+	sllv	$t4, $a0, $t3	# (branch delay slot) even count >= 24: shift left by count-24
+$value_less12:
+	addiu	$t3, $0 , 24
+	sub		$t3, $t2
+	srav	$t4, $a0, $t3	# even count < 24: shift right by 24-count
+$value_greater12:
+	addi	$t4, -64	# table starts at normalised value 64
+	sll		$t4, 1	# halfword index -> byte offset
+	la		$t5, sqrt_table
+	addu	$t5, $t4
+	lh		$t5, 0($t5)	# signed halfword load of the table entry
+	nop	# load delay slot before $t5 is used
+	
+	bltz	$t1, $1594c	# negative shift count: scale down instead of up
+	nop
+	jr		$ra
+	sllv	$v0, $t5, $t1	# (branch delay slot) result = entry << $t1
+	
+$1594c:
+
+	sub		$t1, $0 , $t1	# make the shift count positive
+	jr		$ra
+	srl		$v0, $t5, $t1	# (branch delay slot) result = entry >> -$t1
+	
+$bad_sqr12:
+	jr		$ra
+	move	$v0, $0	# (branch delay slot) return 0
+	
+	
+.global SquareRoot0
+.type SquareRoot0, @function
+SquareRoot0:	# $a0 = input value; returns the table-based square root in $v0 (0 when LZCR reads 32)
+	mtc2	$a0, C2_LZCS	# let the GTE count the leading bits of the input
+	nop
+	nop
+	mfc2	$v0, C2_LZCR	# $v0 = leading-bit count
+	beq		$v0, 32, $bad_sqr	# assembler macro (li $at,32 + beq); count 32 means no significant bits
+	nop
+	andi	$t0, $v0, 0x1	# $t0 is never read afterwards in this routine
+	addiu	$v1, $0 , -2
+	and		$t2, $v0, $v1	# $t2 = count rounded down to an even number
+	li		$t1, 31
+	sub		$t1, $t2
+	sra		$t1, 1	# $t1 = (31 - even count) / 2: left shift applied to the table value
+	addi	$t3, $t2, -24
+	bltz	$t3, $value_less	# choose shift direction that brings the input into the table's index range
+	nop
+	b		$value_greater
+	sllv	$t4, $a0, $t3	# (branch delay slot) even count >= 24: shift left by count-24
+$value_less:
+	addiu	$t3, $0 , 24
+	sub		$t3, $t2
+	srav	$t4, $a0, $t3	# even count < 24: shift right by 24-count
+$value_greater:
+	addi	$t4, -64	# table starts at normalised value 64
+	sll		$t4, 1	# halfword index -> byte offset
+	la		$t5, sqrt_table
+	addu	$t5, $t4
+	lh		$t5, 0($t5)	# signed halfword load of the table entry
+	nop	# load delay slot before $t5 is used
+	sllv	$t5, $t5, $t1	# scale the entry up by $t1 bits
+	jr		$ra
+	srl		$v0, $t5, 12	# (branch delay slot) drop the table's 12 fractional bits
+$bad_sqr:
+	jr		$ra
+	move	$v0, $0	# (branch delay slot) return 0
+	
+
+.section .data
+	
+sqrt_table:	# 192 halfwords; entry i = sqrt((i+64)/64) * 4096 for i = 0..191 (0x1000 = 1.0), read with signed lh
+	.hword	0x1000,0x101f,0x103f,0x105e,0x107e,0x109c,0x10bb,0x10da
+	.hword	0x10f8,0x1116,0x1134,0x1152,0x116f,0x118c,0x11a9,0x11c6
+	.hword	0x11e3,0x1200,0x121c,0x1238,0x1254,0x1270,0x128c,0x12a7
+	.hword	0x12c2,0x12de,0x12f9,0x1314,0x132e,0x1349,0x1364,0x137e
+	.hword	0x1398,0x13b2,0x13cc,0x13e6,0x1400,0x1419,0x1432,0x144c
+	.hword	0x1465,0x147e,0x1497,0x14b0,0x14c8,0x14e1,0x14f9,0x1512
+	.hword	0x152a,0x1542,0x155a,0x1572,0x158a,0x15a2,0x15b9,0x15d1
+	.hword	0x15e8,0x1600,0x1617,0x162e,0x1645,0x165c,0x1673,0x1689
+	.hword	0x16a0,0x16b7,0x16cd,0x16e4,0x16fa,0x1710,0x1726,0x173c
+	.hword	0x1752,0x1768,0x177e,0x1794,0x17aa,0x17bf,0x17d5,0x17ea
+	.hword	0x1800,0x1815,0x182a,0x183f,0x1854,0x1869,0x187e,0x1893
+	.hword	0x18a8,0x18bd,0x18d1,0x18e6,0x18fa,0x190f,0x1923,0x1938
+	.hword	0x194c,0x1960,0x1974,0x1988,0x199c,0x19b0,0x19c4,0x19d8
+	.hword	0x19ec,0x1a00,0x1a13,0x1a27,0x1a3a,0x1a4e,0x1a61,0x1a75
+	.hword	0x1a88,0x1a9b,0x1aae,0x1ac2,0x1ad5,0x1ae8,0x1afb,0x1b0e	# entry 116: sqrt(180/64)*4096 = 6869 = 0x1ad5
+	.hword	0x1b21,0x1b33,0x1b46,0x1b59,0x1b6c,0x1b7e,0x1b91,0x1ba3
+	.hword	0x1bb6,0x1bc8,0x1bdb,0x1bed,0x1c00,0x1c12,0x1c24,0x1c36
+	.hword	0x1c48,0x1c5a,0x1c6c,0x1c7e,0x1c90,0x1ca2,0x1cb4,0x1cc6
+	.hword	0x1cd8,0x1ce9,0x1cfb,0x1d0d,0x1d1e,0x1d30,0x1d41,0x1d53
+	.hword	0x1d64,0x1d76,0x1d87,0x1d98,0x1daa,0x1dbb,0x1dcc,0x1ddd
+	.hword	0x1dee,0x1e00,0x1e11,0x1e22,0x1e33,0x1e43,0x1e54,0x1e65
+	.hword	0x1e76,0x1e87,0x1e98,0x1ea8,0x1eb9,0x1eca,0x1eda,0x1eeb
+	.hword	0x1efb,0x1f0c,0x1f1c,0x1f2d,0x1f3d,0x1f4e,0x1f5e,0x1f6e
+	.hword	0x1f7e,0x1f8f,0x1f9f,0x1faf,0x1fbf,0x1fcf,0x1fdf,0x1fef
diff --git a/libmeidogte/vectornormals.s b/libmeidogte/vectornormals.s
new file mode 100644
index 0000000..0dbe1e8
--- /dev/null
+++ b/libmeidogte/vectornormals.s
@@ -0,0 +1,107 @@
+.set noreorder
+.set noat
+
+.include "gtereg.h"
+.include "inline_s.h"
+
+.section .text
+
+
+.global VectorNormalS
+.type VectorNormalS, @function
+VectorNormalS:	# $a0 = input vector (three 32-bit words), $a1 = output vector (three 16-bit halfwords)
+	lw		$t0, 0($a0)
+	lw		$t1, 4($a0)
+	lw		$t2, 8($a0)
+	# Square the three components on the GTE.
+	mtc2	$t0, C2_IR1
+	mtc2	$t1, C2_IR2
+	mtc2	$t2, C2_IR3
+	
+	nSQR(0)
+	
+	mfc2	$t3, C2_MAC1
+	mfc2	$t4, C2_MAC2
+	mfc2	$t5, C2_MAC3
+	# Sum of squares. addu, not add: the sum can exceed 2^31-1 and add would raise an overflow exception.
+	addu	$t3, $t4
+	addu	$v0, $t3, $t5	# $v0 = squared length; it is still in $v0 on return
+	mtc2	$v0, C2_LZCS	# let the GTE count the leading bits of the squared length
+	nop
+	nop
+	mfc2	$v1, C2_LZCR
+	
+	addiu	$at, $0 , -2	# $at is used explicitly (file sets .set noat)
+	and		$v1, $at	# $v1 = count rounded down to an even number
+	
+	addiu	$t6, $0 , 0x1f
+	sub		$t6, $v1
+	sra		$t6, 1	# $t6 = (31 - even count) / 2: final right shift for the results
+	addiu	$t3, $v1, -24
+	
+	bltz	$t3, $value_neg	# choose shift direction that brings the squared length into the table's index range
+	nop
+	b		$value_pos
+	sllv	$t4, $v0, $t3	# (branch delay slot) even count >= 24: shift left by count-24
+$value_neg:
+	addiu	$t3, $0 , 24
+	sub		$t3, $v1
+	srav	$t4, $v0, $t3	# even count < 24: shift right by 24-count
+$value_pos:
+	addi	$t4, -64	# table starts at normalised value 64
+	sll		$t4, 1	# halfword index -> byte offset
+	
+	la		$t5, _norm_table
+	addu	$t5, $t4
+	lh		$t5, 0($t5)	# table factor for this normalised squared length
+	nop	# load delay slot before $t5 is used
+	
+	mtc2	$t5, C2_IR0	# multiply the original components by the table factor
+	mtc2	$t0, C2_IR1
+	mtc2	$t1, C2_IR2
+	mtc2	$t2, C2_IR3
+	
+	nGPF(0)
+	
+	mfc2	$t0, C2_MAC1
+	mfc2	$t1, C2_MAC2
+	mfc2	$t2, C2_MAC3
+	
+	sra		$t0, $t6	# register shift amount: assembled as srav
+	sra		$t1, $t6
+	sra		$t2, $t6
+	
+	sh		$t0, 0($a1)
+	sh		$t1, 2($a1)
+	jr		$ra
+	sh		$t2, 4($a1)	# (branch delay slot) third output halfword
+
+
+.section .data
+
+_norm_table:	# 192 halfwords indexed by VectorNormalS as (normalised value - 64); values look like 4096/sqrt(x/64), x = 64..255 -- confirm
+	.hword	0x1000, 0x0FE0, 0x0FC1, 0x0FA3, 0x0F85, 0x0F68, 0x0F4C, 0x0F30	# first entry 0x1000 corresponds to index 0
+	.hword	0x0F15, 0x0EFB, 0x0EE1, 0x0EC7, 0x0EAE, 0x0E96, 0x0E7E, 0x0E66
+	.hword	0x0E4F, 0x0E38, 0x0E22, 0x0E0C, 0x0DF7, 0x0DE2, 0x0DCD, 0x0DB9
+	.hword	0x0DA5, 0x0D91, 0x0D7E, 0x0D6B, 0x0D58, 0x0D45, 0x0D33, 0x0D21
+	.hword	0x0D10, 0x0CFF, 0x0CEE, 0x0CDD, 0x0CCC, 0x0CBC, 0x0CAC, 0x0C9C
+	.hword	0x0C8D, 0x0C7D, 0x0C6E, 0x0C5F, 0x0C51, 0x0C42, 0x0C34, 0x0C26
+	.hword	0x0C18, 0x0C0A, 0x0BFD, 0x0BEF, 0x0BE2, 0x0BD5, 0x0BC8, 0x0BBB
+	.hword	0x0BAF, 0x0BA2, 0x0B96, 0x0B8A, 0x0B7E, 0x0B72, 0x0B67, 0x0B5B
+	.hword	0x0B50, 0x0B45, 0x0B39, 0x0B2E, 0x0B24, 0x0B19, 0x0B0E, 0x0B04
+	.hword	0x0AF9, 0x0AEF, 0x0AE5, 0x0ADB, 0x0AD1, 0x0AC7, 0x0ABD, 0x0AB4
+	.hword	0x0AAA, 0x0AA1, 0x0A97, 0x0A8E, 0x0A85, 0x0A7C, 0x0A73, 0x0A6A
+	.hword	0x0A61, 0x0A59, 0x0A50, 0x0A47, 0x0A3F, 0x0A37, 0x0A2E, 0x0A26
+	.hword	0x0A1E, 0x0A16, 0x0A0E, 0x0A06, 0x09FE, 0x09F6, 0x09EF, 0x09E7
+	.hword	0x09E0, 0x09D8, 0x09D1, 0x09C9, 0x09C2, 0x09BB, 0x09B4, 0x09AD
+	.hword	0x09A5, 0x099E, 0x0998, 0x0991, 0x098A, 0x0983, 0x097C, 0x0976
+	.hword	0x096F, 0x0969, 0x0962, 0x095C, 0x0955, 0x094F, 0x0949, 0x0943
+	.hword	0x093C, 0x0936, 0x0930, 0x092A, 0x0924, 0x091E, 0x0918, 0x0912
+	.hword	0x090D, 0x0907, 0x0901, 0x08FB, 0x08F6, 0x08F0, 0x08EB, 0x08E5
+	.hword	0x08E0, 0x08DA, 0x08D5, 0x08CF, 0x08CA, 0x08C5, 0x08BF, 0x08BA
+	.hword	0x08B5, 0x08B0, 0x08AB, 0x08A6, 0x08A1, 0x089C, 0x0897, 0x0892
+	.hword	0x088D, 0x0888, 0x0883, 0x087E, 0x087A, 0x0875, 0x0870, 0x086B
+	.hword	0x0867, 0x0862, 0x085E, 0x0859, 0x0855, 0x0850, 0x084C, 0x0847
+	.hword	0x0843, 0x083E, 0x083A, 0x0836, 0x0831, 0x082D, 0x0829, 0x0824
+	.hword	0x0820, 0x081C, 0x0818, 0x0814, 0x0810, 0x080C, 0x0808, 0x0804	# last entry is index 191
+