Add BS v3 decoding, fix MDEC API and strvideo example

author: spicyjpeg <thatspicyjpeg@gmail.com> 2023-01-23 09:36:22 +0100
committer: spicyjpeg <thatspicyjpeg@gmail.com> 2023-01-23 09:36:22 +0100
commit: 09f321e37fc187affa664d32e36e32c0533a7e8e (patch)
tree: 27f846c194d92a9f4f8e3daea4ff2317e3e66894
parent: a21e949c9aea98cb4b3feee48bb98579bbdfba70 (diff)
download: psn00bsdk-09f321e37fc187affa664d32e36e32c0533a7e8e.tar.gz
8 files changed, 763 insertions, 449 deletions
diff --git a/examples/mdec/strvideo/main.c b/examples/mdec/strvideo/main.c
index 28d39b2..57cb6ef 100644
--- a/examples/mdec/strvideo/main.c
+++ b/examples/mdec/strvideo/main.c
@@ -1,6 +1,6 @@
 /*
  * PSn00bSDK .STR FMV playback example
- * (C) 2022 spicyjpeg - MPL licensed
+ * (C) 2022-2023 spicyjpeg - MPL licensed
  *
  * This example demonstrates playback of full-motion video in the standard .STR
  * format, using the MDEC for frame decoding and XA for audio. Decoded frames
@@ -34,9 +34,10 @@
  * Playback is stopped once the .STR header is no longer present in sectors
  * read.
  *
- * Note that PSn00bSDK's bitstream decoding API only supports version 1 and 2
- * bitstreams currently, so make sure your .STR files are encoded as v2 and not
- * v3.
+ * PSn00bSDK's bitstream decoding API supports both version 2 and 3 bitstreams.
+ * Encoding your .STR files as v3 may result in slightly higher quality
+ * depending on the encoder, but also higher CPU usage during playback compared
+ * to the older v2.
  */
 
 #include <stdint.h>
@@ -102,13 +103,12 @@ void init_context(RenderContext *ctx) {
 	FntOpen(4, 12, 312, 16, 2, 256);
 }
 
-void display(RenderContext *ctx, int sync) {
+void display(RenderContext *ctx) {
 	Framebuffer *db;
 	ctx->db_active ^= 1;
 
 	DrawSync(0);
-	if (sync)
-		VSync(0);
+	// No VSync(0) here: the playback loop in main() waits for vblank itself.
 
 	db = &(ctx->db[ctx->db_active]);
 	PutDrawEnv(&(db->draw));
@@ -163,13 +163,13 @@ typedef struct {
 	volatile int8_t cur_frame, cur_slice;
 } StreamContext;
 
-StreamContext str_ctx;
+static StreamContext str_ctx;
 
 // This buffer is used by cd_sector_handler() as a temporary area for sectors
 // read from the CD. Due to DMA limitations it can't be allocated on the stack
 // (especially not in the interrupt callbacks' stack, whose size is very
 // limited).
-STR_Header sector_header;
+static STR_Header sector_header;
 
 void cd_sector_handler(void) {
 	StreamBuffer *frame = &str_ctx.frames[str_ctx.cur_frame];
@@ -268,7 +268,7 @@ void init_stream(void) {
 	// optional but makes the decompressor slightly faster. See the libpsxpress
 	// documentation for more details.
 	DecDCTvlcSize(0x8000);
-	DecDCTvlcCopyTable((DECDCTTAB *) 0x1f800000);
+	DecDCTvlcCopyTableV3((VLC_TableV3 *) 0x1f800000);
 
 	str_ctx.cur_frame = 0;
 	str_ctx.cur_slice = 0;
@@ -309,7 +309,7 @@ void start_stream(CdlFILE *file) {
 
 static RenderContext ctx;
 
-#define SHOW_STATUS(...) { FntPrint(-1, __VA_ARGS__); FntFlush(-1); display(&ctx, 1); }
+#define SHOW_STATUS(...) { FntPrint(-1, __VA_ARGS__); FntFlush(-1); display(&ctx); }
 #define SHOW_ERROR(...)  { SHOW_STATUS(__VA_ARGS__); while (1) __asm__("nop"); }
 
 int main(int argc, const char* argv[]) {
@@ -318,7 +318,7 @@ int main(int argc, const char* argv[]) {
 	SHOW_STATUS("INITIALIZING\n");
 	SpuInit();
 	CdInit();
-	InitGeom(); // Required for PSn00bSDK's DecDCTvlc()
+	InitGeom(); // GTE initialization required by the VLC decompressor
 	DecDCTReset(0);
 
 	SHOW_STATUS("OPENING VIDEO FILE\n");
@@ -330,8 +330,9 @@ int main(int argc, const char* argv[]) {
 	init_stream();
 	start_stream(&file);
 
-	// Disable framebuffer clearing to get rid of flickering during playback.
-	display(&ctx, 1);
+	// Clear the screen, then disable framebuffer clearing to get rid of
+	// flickering during playback.
+	display(&ctx);
 	ctx.db[0].draw.isbg = 0;
 	ctx.db[1].draw.isbg = 0;
 #ifdef DISP_24BPP
@@ -339,9 +340,13 @@ int main(int argc, const char* argv[]) {
 	ctx.db[1].disp.isrgb24 = 1;
 #endif
 
-	int decode_errors = 0;
+	int frame_time = 1, decode_errors = 0;
 
 	while (1) {
+#ifdef DRAW_OVERLAY
+		int frame_start = TIMER_VALUE(1);
+#endif
+
 		// Wait for a full frame to be read from the disc and decompress the
 		// bitstream into the format expected by the MDEC. If the video has
 		// ended, restart playback from the beginning.
@@ -355,38 +360,45 @@ int main(int argc, const char* argv[]) {
 		}
 
 #ifdef DRAW_OVERLAY
-		// Measure CPU usage of the decompressor using the hblank counter.
-		int total_time = TIMER_VALUE(1) + 1;
-		TIMER_VALUE(1) = 0;
+		int decode_time = TIMER_VALUE(1);
 #endif
 
-		if (DecDCTvlc(frame->bs_data, frame->mdec_data)) {
+		VLC_Context vlc_ctx;
+		if (DecDCTvlcStart(
+			&vlc_ctx,
+			frame->mdec_data,
+			sizeof(frame->mdec_data) / 4,
+			frame->bs_data
+		)) {
 			decode_errors++;
 			continue;
 		}
 
 #ifdef DRAW_OVERLAY
-		int cpu_usage = TIMER_VALUE(1) * 100 / total_time;
+		// Calculate CPU usage of the decompressor.
+		decode_time   = (TIMER_VALUE(1) - decode_time) & 0xffff;
+		int cpu_usage = decode_time * 100 / frame_time;
 #endif
 
 		// Wait for the MDEC to finish decoding the previous frame, then flip
 		// the framebuffers to display it and prepare the buffer for the next
 		// frame.
-		// NOTE: you should *not* call VSync(0) during playback, as the refresh
-		// rate of the GPU is not synced to the video's frame rate. If you want
-		// to minimize screen tearing, consider triple buffering instead (i.e.
-		// always keep 2 fully decoded frames in VRAM and use VSyncCallback()
-		// to register a function that displays the next decoded frame whenever
-		// vblank occurs).
+		// NOTE: as the refresh rate of the GPU is not synced to the video's
+		// frame rate, this VSync(0) call may potentially end up waiting too
+		// long and desynchronizing playback. A better solution would be to
+		// implement triple buffering (i.e. always keep 2 fully decoded frames
+		// in VRAM and use VSyncCallback() to register a function that displays
+		// the next decoded frame if available whenever vblank occurs).
+		VSync(0);
 		DecDCTinSync(0);
 		DecDCToutSync(0);
 
 #ifdef DRAW_OVERLAY
-		FntPrint(-1, "FRAME:%5d    READ ERRORS:  %5d\n", str_ctx.frame_id, str_ctx.dropped_frames);
-		FntPrint(-1, "CPU:  %5d%%   DECODE ERRORS:%5d\n", cpu_usage, decode_errors);
+		FntPrint(-1, "FRAME:%6d      READ ERRORS:  %6d\n", str_ctx.frame_id, str_ctx.dropped_frames);
+		FntPrint(-1, "CPU:  %6d%%     DECODE ERRORS:%6d\n", cpu_usage, decode_errors);
 		FntFlush(-1);
 #endif
-		display(&ctx, 0);
+		display(&ctx);
 
 		// Feed the newly decompressed frame to the MDEC. The MDEC will not
 		// actually start decoding it until an output buffer is also configured
@@ -414,6 +426,10 @@ int main(int argc, const char* argv[]) {
 			str_ctx.slices[str_ctx.cur_slice],
 			BLOCK_SIZE * str_ctx.slice_pos.h / 2
 		);
+
+#ifdef DRAW_OVERLAY
+		frame_time = (TIMER_VALUE(1) - frame_start) & 0xffff;
+#endif
 	}
 
 	return 0;
diff --git a/libpsn00b/include/psxpress.h b/libpsn00b/include/psxpress.h
index dc1d52c..c3b13f4 100644
--- a/libpsn00b/include/psxpress.h
+++ b/libpsn00b/include/psxpress.h
@@ -1,6 +1,6 @@
 /*
  * PSn00bSDK MDEC library
- * (C) 2022 spicyjpeg - MPL licensed
+ * (C) 2022-2023 spicyjpeg - MPL licensed
  */
 
 /**
@@ -17,7 +17,9 @@
  * FMV playback is not part of this library per se, but can implemented using
  * the APIs defined here alongside some code to stream data from the CD drive.
  *
- * Currently only version 1 and 2 .BS files are supported.
+ * Currently bitstream versions 1, 2 and 3 are supported. Version 0 and .IKI
+ * bitstreams are not supported, but no encoder is publicly available for those
+ * anyway.
  */
 
 #ifndef __PSXPRESS_H
@@ -34,28 +36,26 @@ typedef struct _DECDCTENV {
 	int16_t dct[64];	// Inverse DCT matrix (2.14 fixed-point)
 } DECDCTENV;
 
-// This is the "small" lookup table used by DecDCTvlc(). It can be copied to
-// the scratchpad.
+typedef struct _VLC_TableV2 {
+	uint16_t ac0[2];
+	uint32_t ac2[8], ac3[64];
+	uint16_t ac4[8], ac5[8], ac7[16], ac8[32];
+	uint16_t ac9[32], ac10[32], ac11[32], ac12[32];
+} VLC_TableV2;
+
+typedef struct _VLC_TableV3 {
+	uint16_t ac0[2];
+	uint32_t ac2[8], ac3[64];
+	uint16_t ac4[8], ac5[8], ac7[16], ac8[32];
+	uint16_t ac9[32], ac10[32], ac11[32], ac12[32];
+	uint8_t  dc[128], dc_len[9];
+	uint8_t  _reserved[3];
+} VLC_TableV3;
+
 typedef struct _DECDCTTAB {
-	uint16_t	lut0[2];
-	uint32_t	lut2[8];
-	uint32_t	lut3[64];
-	uint16_t	lut4[8];
-	uint16_t	lut5[8];
-	uint16_t	lut7[16];
-	uint16_t	lut8[32];
-	uint16_t	lut9[32];
-	uint16_t	lut10[32];
-	uint16_t	lut11[32];
-	uint16_t	lut12[32];
+	uint32_t ac[8192], ac00[512];
 } DECDCTTAB;
 
-// This is the "large" table used by DecDCTvlc2().
-typedef struct _DECDCTTAB2 {
-	uint32_t	lut[8192];
-	uint32_t	lut00[512];
-} DECDCTTAB2;
-
 typedef enum _DECDCTMODE {
 	DECDCT_MODE_24BPP		= 1,
 	DECDCT_MODE_16BPP		= 0,
@@ -66,8 +66,9 @@ typedef enum _DECDCTMODE {
 typedef struct _VLC_Context {
 	const uint32_t	*input;
 	uint32_t		window, next_window, remaining;
-	uint16_t		quant_scale;
 	int8_t			is_v3, bit_offset, block_index, coeff_index;
+	uint16_t		quant_scale;
+	int16_t			last_y, last_cr, last_cb;
 } VLC_Context;
 
 // Despite what some docs claim, the "number of 32-byte blocks" and "always
@@ -233,8 +234,9 @@ int DecDCToutSync(int mode);
  * frame) into a buffer that can be passed to DecDCTin(). This function uses a
  * small (<1 KB) lookup table combined with the GTE to accelerate the process;
  * performance is roughly on par with DecDCTvlcStart2() if the lookup table
- * is copied to the scratchpad beforehand by calling DecDCTvlcCopyTable(). The
- * contents of the GTE's LZCR register, if any, will be destroyed.
+ * is copied to the scratchpad beforehand by calling DecDCTvlcCopyTableV2() or
+ * DecDCTvlcCopyTableV3(). The contents of the GTE's LZCS and LZCR registers,
+ * if any, will be destroyed.
  *
  * A VLC_Context object must be created and passed to this function, which will
  * then proceed to initialize its fields. The max_size argument sets the
@@ -256,7 +258,7 @@ int DecDCToutSync(int mode);
  * @param bs
  * @return 0, 1 if more data needs to be output or -1 in case of failure
  *
- * @see DecDCTvlcContinue(), DecDCTvlcCopyTable()
+ * @see DecDCTvlcContinue(), DecDCTvlcCopyTableV2(), DecDCTvlcCopyTableV3()
  */
 int DecDCTvlcStart(VLC_Context *ctx, uint32_t *buf, size_t max_size, const uint32_t *bs);
 
@@ -275,7 +277,8 @@ int DecDCTvlcStart(VLC_Context *ctx, uint32_t *buf, size_t max_size, const uint3
  * context returned 0; in that case the context shall be discarded or reused to
  * decode another bitstream.
  *
- * The contents of the GTE's LZCR register, if any, will be destroyed.
+ * The contents of the GTE's LZCS and LZCR registers, if any, will be
+ * destroyed.
  *
  * See DecDCTvlcStart() for more details.
  *
@@ -309,7 +312,7 @@ int DecDCTvlcContinue(VLC_Context *ctx, uint32_t *buf, size_t max_size);
  * @param buf
  * @return 0, 1 if more data needs to be output or -1 in case of failure
  *
- * @see DecDCTvlcSize(), DecDCTvlcCopyTable()
+ * @see DecDCTvlcSize(), DecDCTvlcCopyTableV2(), DecDCTvlcCopyTableV3()
  */
 int DecDCTvlc(const uint32_t *bs, uint32_t *buf);
 
@@ -332,23 +335,60 @@ int DecDCTvlc(const uint32_t *bs, uint32_t *buf);
 size_t DecDCTvlcSize(size_t size);
 
 /**
- * @brief Moves the lookup table used by the .BS decompressor to the scratchpad
- * region.
+ * @brief Copies the lookup tables used by the .BS decompressor (v1/v2) to the
+ * scratchpad region.
+ *
+ * @details Copies the lookup table used by DecDCTvlcContinue(),
+ * DecDCTvlcStart() and DecDCTvlc() to the specified address. A copy of this
+ * table is always present in main RAM, however this function can be used to
+ * copy it to the scratchpad region to boost decompression performance.
+ *
+ * This function copies a 676-byte table (VLC_TableV2 structure) containing
+ * only the data necessary for decoding version 1 and 2 bitstreams, to help
+ * save scratchpad space. If support for version 3 is required,
+ * DecDCTvlcCopyTableV3() can be used instead to copy the full 816-byte table.
+ *
+ * The address passed to this function is saved. Calls to DecDCTvlcStart(),
+ * DecDCTvlcContinue() and DecDCTvlc() will automatically use the last table
+ * copied. Call DecDCTvlcCopyTableV2(0) or DecDCTvlcCopyTableV3(0) to revert to
+ * using the library's internal table in main RAM.
+ *
+ * WARNING: attempting to decode a version 3 .BS file or .STR frame after
+ * calling this function will result in undefined behavior and potentially a
+ * crash. To re-enable version 3 decoding, use DecDCTvlcCopyTableV3() to copy
+ * the full table to the scratchpad or revert to using the built-in table in
+ * main RAM.
+ *
+ * @param addr Pointer to free 676-byte area in scratchpad region or 0 to reset
+ *
+ * @see DecDCTvlcCopyTableV3()
+ */
+void DecDCTvlcCopyTableV2(VLC_TableV2 *addr);
+
+/**
+ * @brief Copies the lookup tables used by the .BS decompressor (v1/v2/v3) to
+ * the scratchpad region.
  *
- * @details Copies the small (<1 KB) lookup table used by DecDCTvlcContinue(),
- * DecDCTvlcStart() and DecDCTvlc() (a DECDCTTAB structure) to the specified
- * address. A copy of this table is always present in main RAM, however this
- * function can be used to copy it to the scratchpad region to boost
- * decompression performance.
+ * @details Copies the lookup table used by DecDCTvlcContinue(),
+ * DecDCTvlcStart() and DecDCTvlc() to the specified address. A copy of this
+ * table is always present in main RAM, however this function can be used to
+ * copy it to the scratchpad region to boost decompression performance.
+ *
+ * This function copies the full 816-byte table (VLC_TableV3 structure),
+ * including the data used to decode version 3 bitstreams. If support for
+ * version 3 is not required, DecDCTvlcCopyTableV2() can be used instead to
+ * save scratchpad space by only copying the first 676 bytes of the table.
  *
  * The address passed to this function is saved. Calls to DecDCTvlcStart(),
  * DecDCTvlcContinue() and DecDCTvlc() will automatically use the last table
- * copied. Call DecDCTvlcCopyTable(0) to revert to using the library's internal
- * table in main RAM.
+ * copied. Call DecDCTvlcCopyTableV2(0) or DecDCTvlcCopyTableV3(0) to revert to
+ * using the library's internal table in main RAM.
+ *
+ * @param addr Pointer to free 816-byte area in scratchpad region or 0 to reset
  *
- * @param addr Pointer to free area in scratchpad region or 0 to reset
+ * @see DecDCTvlcCopyTableV2()
  */
-void DecDCTvlcCopyTable(DECDCTTAB *addr);
+void DecDCTvlcCopyTableV3(VLC_TableV3 *addr);
 
 /**
  * @brief Decompresses or begins decompressing a .BS file into MDEC codes
@@ -360,8 +400,8 @@ void DecDCTvlcCopyTable(DECDCTTAB *addr);
  * calling DecDCTvlcBuild(), but does not use the GTE nor the scratchpad.
  * Depending on the specific bitstream being decoded DecDCTvlcStart2() might be
  * slightly faster or slower than DecDCTvlcStart() with its lookup table copied
- * to the scratchpad (see DecDCTvlcCopyTable()). DecDCTvlcStart() with the
- * table in main RAM tends to be much slower.
+ * to the scratchpad (see DecDCTvlcCopyTableV2() and DecDCTvlcCopyTableV3()).
+ * DecDCTvlcStart() with the table in main RAM tends to be much slower.
  *
  * A VLC_Context object must be created and passed to this function, which will
  * then proceed to initialize its fields. The max_size argument sets the
@@ -432,7 +472,7 @@ int DecDCTvlcContinue2(VLC_Context *ctx, uint32_t *buf, size_t max_size);
  *
  * @see DecDCTvlcSize2(), DecDCTvlcBuild()
  */
-int DecDCTvlc2(const uint32_t *bs, uint32_t *buf, DECDCTTAB2 *table);
+int DecDCTvlc2(const uint32_t *bs, uint32_t *buf, DECDCTTAB *table);
 
 /**
  * @brief Sets the maximum amount of data to be decompressed (alternate
@@ -458,7 +498,7 @@ size_t DecDCTvlcSize2(size_t size);
  * the .BS decompressor.
  *
  * @details Generates the lookup table required by DecDCTvlcStart2(),
- * DecDCTvlcContinue2() and DecDCTvlc2() (a DECDCTTAB2 structure) into the
+ * DecDCTvlcContinue2() and DecDCTvlc2() (a DECDCTTAB structure) into the
  * specified buffer. Since the table is relatively large (34 KB), it is
  * recommended to only generate it in a dynamically-allocated buffer when
  * needed and deallocate the buffer afterwards.
@@ -468,7 +508,7 @@ size_t DecDCTvlcSize2(size_t size);
  *
  * @param table
  */
-void DecDCTvlcBuild(DECDCTTAB2 *table);
+void DecDCTvlcBuild(DECDCTTAB *table);
 
 #ifdef __cplusplus
 }
diff --git a/libpsn00b/psxpress/README.md b/libpsn00b/psxpress/README.md
index a894874..df18ec5 100644
--- a/libpsn00b/psxpress/README.md
+++ b/libpsn00b/psxpress/README.md
@@ -1,14 +1,19 @@
 
 # PSn00bSDK MDEC library
 
-This is a fully open source reimplementation of the official SDK's "data
+This is a fully original reimplementation of the official SDK's "data
 compression" library. This library is made up of two parts, the MDEC API and
 functions to decompress Huffman-encoded bitstreams (.BS files, or frames in
-.STR files) into data to be fed to the MDEC. FMV playback is not part of this
-library (nor the official one) per se, but can implemented by using these APIs
-alongside some code to stream data from the CD drive.
+.STR files) into data to be fed to the MDEC. Two different implementations of
+the latter are provided, one using the GTE and scratchpad region and an older
+one using a large lookup table in main RAM.
 
-**Currently only version 1 and 2 bitstreams are supported**.
+FMV playback is not part of this library per se, but can be implemented using
+the APIs defined here alongside some code to stream data from the CD drive.
+
+Currently bitstream versions 1, 2 and 3 are supported. Version 0 and .IKI
+bitstreams are not supported, but no encoder is publicly available for those
+anyway.
 
 ## MDEC API
 
@@ -26,14 +31,16 @@ The following functions are currently provided:
 
 - `DecDCTvlcStart()`, `DecDCTvlcContinue()`: a decompressor implementation that
   uses a small (<1 KB) lookup table and leverages the GTE, written in assembly.
-  `DecDCTvlcCopyTable()` can optionally be called to temporarily move the table
-  to the scratchpad region to improve decompression speed.
-- `DecDCTvlcStart2()`, `DecDCTvlcContinue2()`: a different implementation using
+  `DecDCTvlcCopyTableV2()` or `DecDCTvlcCopyTableV3()` may optionally be called
+  to temporarily move the table to the scratchpad region in order to boost
+  decompression speed.
+- `DecDCTvlcStart2()`, `DecDCTvlcContinue2()`: an older implementation using
   a large (34 KB) lookup table in main RAM, written in C. The table must be
-  decompressed ahead of time using `DecDCTvlcBuild()`, but can be deallocated
-  when no longer needed.
+  decompressed ahead of time manually using `DecDCTvlcBuild()`, but can be
+  deallocated when no longer needed. **This implementation does not support**
+  **version 3 bitstreams**.
 - `DecDCTvlc()`, `DecDCTvlc2()`: wrappers around the functions listed above,
-  for compatibility with the Sony SDK. Using them is not recommended.
+  for compatibility with the Sony SDK.
 
 ## SPU ADPCM encoding API
 
diff --git a/libpsn00b/psxpress/mdec.c b/libpsn00b/psxpress/mdec.c
index 3596188..394a0ce 100644
--- a/libpsn00b/psxpress/mdec.c
+++ b/libpsn00b/psxpress/mdec.c
@@ -1,12 +1,11 @@
 /*
  * PSn00bSDK MDEC library (low-level MDEC/DMA API)
- * (C) 2022 spicyjpeg - MPL licensed
+ * (C) 2022-2023 spicyjpeg - MPL licensed
  */
 
 #include <stdint.h>
 #include <assert.h>
 #include <psxetc.h>
-#include <psxapi.h>
 #include <psxpress.h>
 #include <hwregs_c.h>
 
@@ -15,14 +14,14 @@
 
 /* Default IDCT matrix and quantization tables */
 
-#define S0 0x5a82	// 0x4000 * cos(0/16 * pi) * sqrt(2)
-#define S1 0x7d8a	// 0x4000 * cos(1/16 * pi) * 2
-#define S2 0x7641	// 0x4000 * cos(2/16 * pi) * 2
-#define S3 0x6a6d	// 0x4000 * cos(3/16 * pi) * 2
-#define S4 0x5a82	// 0x4000 * cos(4/16 * pi) * 2
-#define S5 0x471c	// 0x4000 * cos(5/16 * pi) * 2
-#define S6 0x30fb	// 0x4000 * cos(6/16 * pi) * 2
-#define S7 0x18f8	// 0x4000 * cos(7/16 * pi) * 2
+#define S0 0x5a82	// (1 << 14) * cos(0/16 * pi) * sqrt(2)
+#define S1 0x7d8a	// (1 << 14) * cos(1/16 * pi) * 2
+#define S2 0x7641	// (1 << 14) * cos(2/16 * pi) * 2
+#define S3 0x6a6d	// (1 << 14) * cos(3/16 * pi) * 2
+#define S4 0x5a82	// (1 << 14) * cos(4/16 * pi) * 2
+#define S5 0x471c	// (1 << 14) * cos(5/16 * pi) * 2
+#define S6 0x30fb	// (1 << 14) * cos(6/16 * pi) * 2
+#define S7 0x18f8	// (1 << 14) * cos(7/16 * pi) * 2
 
 static const DECDCTENV _default_mdec_env = {
 	// The default luma and chroma quantization table is based on the MPEG-1
@@ -85,8 +84,6 @@ static const DECDCTENV _default_mdec_env = {
 /* Public API */
 
 void DecDCTReset(int mode) {
-	FastEnterCriticalSection();
-
 	SetDMAPriority(DMA_MDEC_IN,  3);
 	SetDMAPriority(DMA_MDEC_OUT, 3);
 	DMA_CHCR(DMA_MDEC_IN)  = 0x00000201; // Stop DMA
@@ -95,26 +92,28 @@ void DecDCTReset(int mode) {
 	MDEC1 = 0x80000000; // Reset MDEC
 	MDEC1 = 0x60000000; // Enable DMA in/out requests
 
-	FastExitCriticalSection();
 	if (!mode)
 		DecDCTPutEnv(0, 0);
 }
 
 void DecDCTPutEnv(const DECDCTENV *env, int mono) {
-	const DECDCTENV *_env = env ? env : &_default_mdec_env;
 	DecDCTinSync(0);
+	if (!env)
+		env = &_default_mdec_env;
 
 	MDEC0 = 0x60000000; // Set IDCT matrix
-	DecDCTinRaw((const uint32_t *) _env->dct, 32);
+	DecDCTinRaw((const uint32_t *) env->dct, 32);
 	DecDCTinSync(0);
 
-	MDEC0 = 0x40000000 | (mono ? 0 : 1); // Set table(s)
-	DecDCTinRaw((const uint32_t *) _env->iq_y, mono ? 16 : 32);
+	MDEC0 = 0x40000000 | (mono ? 0 : 1); // Set quantization table(s)
+	DecDCTinRaw((const uint32_t *) env->iq_y, mono ? 16 : 32);
 	DecDCTinSync(0);
 }
 
 void DecDCTin(const uint32_t *data, int mode) {
 	uint32_t header = *data;
+	DecDCTinSync(0);
+
 	if (mode == DECDCT_MODE_RAW)
 		MDEC0 = header;
 	else if (mode & DECDCT_MODE_24BPP)
@@ -153,7 +152,7 @@ int DecDCTinSync(int mode) {
 			return 0;
 	}
 
-	_sdk_log("DecDCTinSync() timeout\n");
+	_sdk_log("DecDCTinSync() timeout, MDEC1=0x%08x\n", MDEC1);
 	return -1;
 }
 
@@ -184,6 +183,6 @@ int DecDCToutSync(int mode) {
 			return 0;
 	}
 
-	_sdk_log("DecDCToutSync() timeout\n");
+	_sdk_log("DecDCToutSync() timeout, CHCR=0x%08x\n", DMA_CHCR(DMA_MDEC_OUT));
 	return -1;
 }
diff --git a/libpsn00b/psxpress/vlc.c b/libpsn00b/psxpress/vlc.c
index 4e3e283..36cfbe2 100644
--- a/libpsn00b/psxpress/vlc.c
+++ b/libpsn00b/psxpress/vlc.c
@@ -1,6 +1,6 @@
 /*
  * PSn00bSDK MDEC library (support code for the main VLC decompressor)
- * (C) 2022 spicyjpeg - MPL licensed
+ * (C) 2022-2023 spicyjpeg - MPL licensed
  */
 
 #include <stdint.h>
@@ -10,87 +10,120 @@
 
 /* Huffman code lookup table */
 
-#define _val1(rl, dc)		(((rl) << 10) | ((uint16_t) (dc) & 0x3ff))
-#define _val2(rl, dc, len)	(_val1(rl, dc) | (len << 16))
+#define _DC(y, c)			(((y) << 4) | (c))
+#define _AC(rl, dc)			(((rl) << 10) | ((uint16_t) (dc) & 0x3ff))
+#define _ACL(rl, dc, len)	(_AC(rl, dc) | ((len) << 16))
 
-#define _pair(rl, dc)		_val1(rl, dc), _val1(rl, -(dc))
-#define _pair2(rl, dc, len)	_val2(rl, dc, len), _val2(rl, -(dc), len)
-#define _pair3(rl, dc, len) \
-	_val2(rl, dc, len), _val2(rl, dc, len), \
-	_val2(rl, -(dc), len), _val2(rl, -(dc), len)
-#define _pair4(rl, dc, len) \
-	_val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), \
-	_val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), \
-	_val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len), \
-	_val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len)
+#define _DC2(y, c)			_DC(y, c), _DC(y, c)
+#define _DC3(y, c)			_DC(y, c), _DC(y, c), _DC(y, c), _DC(y, c)
+#define _DC4(y, c) \
+	_DC(y, c), _DC(y, c), _DC(y, c), _DC(y, c), \
+	_DC(y, c), _DC(y, c), _DC(y, c), _DC(y, c)
+#define _AC2(rl, dc)		_AC(rl, dc), _AC(rl, -(dc))
+#define _ACL2(rl, dc, len)	_ACL(rl, dc, len), _ACL(rl, -(dc), len)
+#define _ACL3(rl, dc, len) \
+	_ACL(rl, dc, len), _ACL(rl, dc, len), \
+	_ACL(rl, -(dc), len), _ACL(rl, -(dc), len)
+#define _ACL4(rl, dc, len) \
+	_ACL(rl, dc, len), _ACL(rl, dc, len), _ACL(rl, dc, len), _ACL(rl, dc, len), \
+	_ACL(rl, dc, len), _ACL(rl, dc, len), _ACL(rl, dc, len), _ACL(rl, dc, len), \
+	_ACL(rl, -(dc), len), _ACL(rl, -(dc), len), _ACL(rl, -(dc), len), _ACL(rl, -(dc), len), \
+	_ACL(rl, -(dc), len), _ACL(rl, -(dc), len), _ACL(rl, -(dc), len), _ACL(rl, -(dc), len)
 
 // This table isn't compressed since it makes no sense to compress less than a
 // kilobyte's worth of data.
-static const DECDCTTAB _default_huffman_table = {
-	.lut0 = {
+static const VLC_TableV3 _default_huffman_table = {
+	.ac0 = {
 		// 11 x
-		_pair( 0,  1)
+		_AC2( 0,  1)
 	},
-	.lut2 = {
+	.ac2 = {
 		// 01 0xx
-		_pair2( 0,  2, 5), _pair2( 2,  1, 5),
+		_ACL2( 0, 2, 5), _ACL2( 2, 1, 5),
 		// 01 1x-
-		_pair3( 1,  1, 4)
+		_ACL3( 1, 1, 4)
 	},
-	.lut3 = {
+	.ac3 = {
 		// 001 00xxxx
-		_pair2(13,  1, 9), _pair2( 0,  6, 9), _pair2(12,  1, 9), _pair2(11,  1, 9),
-		_pair2( 3,  2, 9), _pair2( 1,  3, 9), _pair2( 0,  5, 9), _pair2(10,  1, 9),
+		_ACL2(13, 1, 9), _ACL2( 0, 6, 9), _ACL2(12,  1, 9), _ACL2(11, 1, 9),
+		_ACL2( 3, 2, 9), _ACL2( 1, 3, 9), _ACL2( 0,  5, 9), _ACL2(10, 1, 9),
 		// 001 xxx---
-		_pair4( 0,  3, 6), _pair4( 4,  1, 6), _pair4( 3,  1, 6)
+		_ACL4( 0, 3, 6), _ACL4( 4, 1, 6), _ACL4( 3,  1, 6)
 	},
-	.lut4 = {
+	.ac4 = {
 		// 0001 xxx
-		_pair( 7,  1), _pair( 6,  1), _pair( 1,  2), _pair( 5,  1)
+		_AC2( 7,  1), _AC2( 6,  1), _AC2( 1,  2), _AC2( 5,  1)
 	},
-	.lut5 = {
+	.ac5 = {
 		// 00001 xxx
-		_pair( 2,  2), _pair( 9,  1), _pair( 0,  4), _pair( 8,  1)
+		_AC2( 2,  2), _AC2( 9,  1), _AC2( 0,  4), _AC2( 8,  1)
 	},
-	.lut7 = {
+	.ac7 = {
 		// 0000001 xxxx
-		_pair(16,  1), _pair( 5,  2), _pair( 0,  7), _pair( 2,  3),
-		_pair( 1,  4), _pair(15,  1), _pair(14,  1), _pair( 4,  2)
+		_AC2(16,  1), _AC2( 5,  2), _AC2( 0,  7), _AC2( 2,  3),
+		_AC2( 1,  4), _AC2(15,  1), _AC2(14,  1), _AC2( 4,  2)
 	},
-	.lut8 = {
+	.ac8 = {
 		// 00000001 xxxxx
-		_pair( 0, 11), _pair( 8,  2), _pair( 4,  3), _pair( 0, 10),
-		_pair( 2,  4), _pair( 7,  2), _pair(21,  1), _pair(20,  1),
-		_pair( 0,  9), _pair(19,  1), _pair(18,  1), _pair( 1,  5),
-		_pair( 3,  3), _pair( 0,  8), _pair( 6,  2), _pair(17,  1)
+		_AC2( 0, 11), _AC2( 8,  2), _AC2( 4,  3), _AC2( 0, 10),
+		_AC2( 2,  4), _AC2( 7,  2), _AC2(21,  1), _AC2(20,  1),
+		_AC2( 0,  9), _AC2(19,  1), _AC2(18,  1), _AC2( 1,  5),
+		_AC2( 3,  3), _AC2( 0,  8), _AC2( 6,  2), _AC2(17,  1)
 	},
-	.lut9 = {
+	.ac9 = {
 		// 000000001 xxxxx
-		_pair(10,  2), _pair( 9,  2), _pair( 5,  3), _pair( 3,  4),
-		_pair( 2,  5), _pair( 1,  7), _pair( 1,  6), _pair( 0, 15),
-		_pair( 0, 14), _pair( 0, 13), _pair( 0, 12), _pair(26,  1),
-		_pair(25,  1), _pair(24,  1), _pair(23,  1), _pair(22,  1)
+		_AC2(10,  2), _AC2( 9,  2), _AC2( 5,  3), _AC2( 3,  4),
+		_AC2( 2,  5), _AC2( 1,  7), _AC2( 1,  6), _AC2( 0, 15),
+		_AC2( 0, 14), _AC2( 0, 13), _AC2( 0, 12), _AC2(26,  1),
+		_AC2(25,  1), _AC2(24,  1), _AC2(23,  1), _AC2(22,  1)
 	},
-	.lut10 = {
+	.ac10 = {
 		// 0000000001 xxxxx
-		_pair( 0, 31), _pair( 0, 30), _pair( 0, 29), _pair( 0, 28),
-		_pair( 0, 27), _pair( 0, 26), _pair( 0, 25), _pair( 0, 24),
-		_pair( 0, 23), _pair( 0, 22), _pair( 0, 21), _pair( 0, 20),
-		_pair( 0, 19), _pair( 0, 18), _pair( 0, 17), _pair( 0, 16)
+		_AC2( 0, 31), _AC2( 0, 30), _AC2( 0, 29), _AC2( 0, 28),
+		_AC2( 0, 27), _AC2( 0, 26), _AC2( 0, 25), _AC2( 0, 24),
+		_AC2( 0, 23), _AC2( 0, 22), _AC2( 0, 21), _AC2( 0, 20),
+		_AC2( 0, 19), _AC2( 0, 18), _AC2( 0, 17), _AC2( 0, 16)
 	},
-	.lut11 = {
+	.ac11 = {
 		// 00000000001 xxxxx
-		_pair( 0, 40), _pair( 0, 39), _pair( 0, 38), _pair( 0, 37),
-		_pair( 0, 36), _pair( 0, 35), _pair( 0, 34), _pair( 0, 33),
-		_pair( 0, 32), _pair( 1, 14), _pair( 1, 13), _pair( 1, 12),
-		_pair( 1, 11), _pair( 1, 10), _pair( 1,  9), _pair( 1,  8)
+		_AC2( 0, 40), _AC2( 0, 39), _AC2( 0, 38), _AC2( 0, 37),
+		_AC2( 0, 36), _AC2( 0, 35), _AC2( 0, 34), _AC2( 0, 33),
+		_AC2( 0, 32), _AC2( 1, 14), _AC2( 1, 13), _AC2( 1, 12),
+		_AC2( 1, 11), _AC2( 1, 10), _AC2( 1,  9), _AC2( 1,  8)
 	},
-	.lut12 = {
+	.ac12 = {
 		// 000000000001 xxxxx
-		_pair( 1, 18), _pair( 1, 17), _pair( 1, 16), _pair( 1, 15),
-		_pair( 6,  3), _pair(16,  2), _pair(15,  2), _pair(14,  2),
-		_pair(13,  2), _pair(12,  2), _pair(11,  2), _pair(31,  1),
-		_pair(30,  1), _pair(29,  1), _pair(28,  1), _pair(27,  1)
+		_AC2( 1, 18), _AC2( 1, 17), _AC2( 1, 16), _AC2( 1, 15),
+		_AC2( 6,  3), _AC2(16,  2), _AC2(15,  2), _AC2(14,  2),
+		_AC2(13,  2), _AC2(12,  2), _AC2(11,  2), _AC2(31,  1),
+		_AC2(30,  1), _AC2(29,  1), _AC2(28,  1), _AC2(27,  1)
+	},
+	.dc = {
+		// 00-----
+		_DC4(1, 0), _DC4(1, 0), _DC4(1, 0), _DC4(1, 0),
+		// 01-----
+		_DC4(2, 1), _DC4(2, 1), _DC4(2, 1), _DC4(2, 1),
+		// 100----
+		_DC4(0, 2), _DC4(0, 2),
+		// 101----
+		_DC4(3, 2), _DC4(3, 2),
+		// 110----
+		_DC4(4, 3), _DC4(4, 3),
+		// 1110---
+		_DC4(5, 4),
+		// 11110--
+		_DC3(6, 5),
+		// 111110-
+		_DC2(7, 6),
+		// 1111110
+		_DC(8, 7),
+		// 1111111(0)
+		_DC(0, 8)
+	},
+	.dc_len = {
+		_DC(3, 2), _DC(2, 2), _DC(2, 2), _DC(3, 3),
+		_DC(3, 4), _DC(4, 5), _DC(5, 6), _DC(6, 7),
+		_DC(7, 8)
 	}
 };
 
@@ -100,7 +133,7 @@ static const DECDCTTAB _default_huffman_table = {
 static VLC_Context	_default_context;
 static size_t		_max_buffer_size = 0;
 
-const DECDCTTAB		*_vlc_huffman_table = &_default_huffman_table;
+const VLC_TableV3 *_vlc_huffman_table = &_default_huffman_table;
 
 /* Stateful VLC decoder API (for Sony SDK compatibility) */
 
@@ -120,10 +153,19 @@ size_t DecDCTvlcSize(size_t size) {
 
 /* Lookup table relocation API */
 
-void DecDCTvlcCopyTable(DECDCTTAB *addr) {
+void DecDCTvlcCopyTableV2(VLC_TableV2 *addr) {
+	if (addr) {
+		_vlc_huffman_table = (const VLC_TableV3 *) addr;
+		memcpy(addr, &_default_huffman_table, sizeof(VLC_TableV2));
+	} else {
+		_vlc_huffman_table = &_default_huffman_table;
+	}
+}
+
+void DecDCTvlcCopyTableV3(VLC_TableV3 *addr) {
 	if (addr) {
-		_vlc_huffman_table = addr;
-		memcpy(addr, &_default_huffman_table, sizeof(DECDCTTAB));
+		_vlc_huffman_table = (const VLC_TableV3 *) addr;
+		memcpy(addr, &_default_huffman_table, sizeof(VLC_TableV3));
 	} else {
 		_vlc_huffman_table = &_default_huffman_table;
 	}
diff --git a/libpsn00b/psxpress/vlc.s b/libpsn00b/psxpress/vlc.s
index f3a1c67..2de22f7 100644
--- a/libpsn00b/psxpress/vlc.s
+++ b/libpsn00b/psxpress/vlc.s
@@ -1,375 +1,576 @@
 # PSn00bSDK MDEC library (GTE-accelerated VLC decompressor)
-# (C) 2022 spicyjpeg - MPL licensed
+# (C) 2022-2023 spicyjpeg - MPL licensed
 #
-# Register map:
-# - $a0 = ctx
-# - $a1 = output
-# - $a2 = max_size
-# - $a3 = input
-# - $t0 = window
-# - $t1 = next_window
-# - $t2 = remaining
-# - $t3 = quant_scale
-# - $t4 = is_v3
-# - $t5 = bit_offset
-# - $t6 = block_index
-# - $t7 = coeff_index
-# - $t8 = _vlc_huffman_table
-# - $t9 = &ac_jump_area
+# TODO: reduce the size of the v3 DC coefficient decoder; currently the code is
+# duplicated for each block type, but it can probably be shortened with no
+# performance impact...
 
-.set noreorder
+.include "gtereg.inc"
 
-.set VLC_Context_input,			0
-.set VLC_Context_window,		4
-.set VLC_Context_next_window,	8
-.set VLC_Context_remaining,		12
-.set VLC_Context_quant_scale,	16
-.set VLC_Context_is_v3,			18
-.set VLC_Context_bit_offset,	19
-.set VLC_Context_block_index,	20
-.set VLC_Context_coeff_index,	21
-
-.set DECDCTTAB_lut0,	0
-.set DECDCTTAB_lut2,	4
-.set DECDCTTAB_lut3,	36
-.set DECDCTTAB_lut4,	292
-.set DECDCTTAB_lut5,	308
-.set DECDCTTAB_lut7,	324
-.set DECDCTTAB_lut8,	356
-.set DECDCTTAB_lut9,	420
-.set DECDCTTAB_lut10,	484
-.set DECDCTTAB_lut11,	548
-.set DECDCTTAB_lut12,	612
+.set noreorder
+.set noat
+
+.set value,			$v0
+.set length,		$v1
+.set ctx,			$a0
+.set output,		$a1
+.set max_size,		$a2
+.set input,			$a3
+.set temp,			$t0
+.set window,		$t1
+.set next_window,	$t2
+.set remaining,		$t3
+.set is_v3,			$t4
+.set bit_offset,	$t5
+.set block_index,	$t6
+.set coeff_index,	$t7
+.set quant_scale,	$s0
+.set last_y,		$s1
+.set last_cr,		$s2
+.set last_cb,		$s3
+.set huffman_table,	$t8
+.set ac_jump_area,	$t9
+
+.set VLC_Context_input,			0x0
+.set VLC_Context_window,		0x4
+.set VLC_Context_next_window,	0x8
+.set VLC_Context_remaining,		0xc
+.set VLC_Context_is_v3,			0x10
+.set VLC_Context_bit_offset,	0x11
+.set VLC_Context_block_index,	0x12
+.set VLC_Context_coeff_index,	0x13
+.set VLC_Context_quant_scale,	0x14
+.set VLC_Context_last_y,		0x16
+.set VLC_Context_last_cr,		0x18
+.set VLC_Context_last_cb,		0x1a
+
+.set VLC_Table_ac0,		0x0
+.set VLC_Table_ac2,		0x4
+.set VLC_Table_ac3,		0x24
+.set VLC_Table_ac4,		0x124
+.set VLC_Table_ac5,		0x134
+.set VLC_Table_ac7,		0x144
+.set VLC_Table_ac8,		0x164
+.set VLC_Table_ac9,		0x1a4
+.set VLC_Table_ac10,	0x1e4
+.set VLC_Table_ac11,	0x224
+.set VLC_Table_ac12,	0x264
+.set VLC_Table_dc,		0x2a4
+.set VLC_Table_dc_len,	0x324
 
 .section .text.DecDCTvlcStart
 .global DecDCTvlcStart
 .type DecDCTvlcStart, @function
 DecDCTvlcStart:
+	addiu $sp, -16 # stack frame used to preserve $s0-$s3 (quant_scale, last_y, last_cr, last_cb)
+	sw    $s0,  0($sp)
+	sw    $s1,  4($sp)
+	sw    $s2,  8($sp)
+	sw    $s3, 12($sp)
+
 	# Create a new context on-the-fly without writing it to memory then jump
 	# into DecDCTvlcContinue(), skipping context loading.
-	lw    $t0, 8($a3) # window = (bs->data[0] << 16) | (bs->data[0] >> 16)
-	nop
-	srl   $v0, $t0, 16
-	sll   $t0, 16
-
-	lw    $t1, 12($a3) # next_window = (bs->data[1] << 16) | (bs->data[1] >> 16)
-	or    $t0, $v0
-	srl   $v0, $t1, 16
-	sll   $t1, 16
-
-	lhu   $t2, 0($a3) # remaining = bs->uncomp_length * 2
-	or    $t1, $v0
-
-	lhu   $t3, 4($a3) # quant_scale = (bs->quant_scale & 63) << 10
-	sll   $t2, 1
-	andi  $t3, 63
-
-	lhu   $t4, 6($a3) # is_v3 = !(bs->version < 3)
-	sll   $t3, 10
-	sltiu $t4, $t4, 3
-	xori  $t4, 1
-
-	li    $t5, 32 # bit_offset = 32
-	li    $t6, 5 # block_index = 5
-	li    $t7, 0 # coeff_index = 0
+	lw    window, 8(input) # window = (bs->data[0] << 16) | (bs->data[0] >> 16)
+	li    last_y, 0 # fills the load delay slot; the luma DC predictor starts at 0
+	srl   temp, window, 16
+	sll   window, 16
+	or    window, temp
+
+	# next_window = (bs->data[1] << 16) | (bs->data[1] >> 16)
+	lw    next_window, 12(input)
+	li    last_cr, 0 # fills the load delay slot; the Cr DC predictor starts at 0
+	srl   temp, next_window, 16
+	sll   next_window, 16
+	or    next_window, temp
+
+	lhu   remaining, 0(input) # remaining = bs->uncomp_length * 2
+	li    last_cb, 0 # fills the load delay slot; the Cb DC predictor starts at 0
+	sll   remaining, 1
+
+	lw    temp, 4(input) # quant_scale = (bs->quant_scale & 63) << 10
+	li    bit_offset, 32 # bit_offset = 32 (fills the load delay slot)
+	andi  quant_scale, temp, 63
+	sll   quant_scale, 10
+
+	srl   temp, 16 # version is the upper halfword of the word loaded above; is_v3 = !(bs->version < 3)
+	sltiu is_v3, temp, 3
+	xori  is_v3, 1
+
+	li    block_index, 5 # block_index = 5 (Cr); it counts down through Cb (4) to the luma blocks (3-0)
+	li    coeff_index, 0 # coeff_index = 0, i.e. the first code decoded is a DC coefficient
 	j     _vlc_skip_context_load
-	addiu $a3, 16 # input = &(bs->data[2])
+	addiu input, 16 # input = &(bs->data[2])
 
 .section .text.DecDCTvlcContinue
 .global DecDCTvlcContinue
 .type DecDCTvlcContinue, @function
 DecDCTvlcContinue:
-	lw    $a3, VLC_Context_input($a0)
-	lw    $t0, VLC_Context_window($a0)
-	lw    $t1, VLC_Context_next_window($a0)
-	lw    $t2, VLC_Context_remaining($a0)
-	lhu   $t3, VLC_Context_quant_scale($a0)
-	lb    $t4, VLC_Context_is_v3($a0)
-	lb    $t5, VLC_Context_bit_offset($a0)
-	lb    $t6, VLC_Context_block_index($a0)
-	lb    $t7, VLC_Context_coeff_index($a0)
+	addiu $sp, -16 # stack frame used to preserve $s0-$s3 (quant_scale, last_y, last_cr, last_cb)
+	sw    $s0,  0($sp)
+	sw    $s1,  4($sp)
+	sw    $s2,  8($sp)
+	sw    $s3, 12($sp)
+
+	lw    input, VLC_Context_input(ctx) # resume from the state saved in *ctx by a previous call
+	lw    window, VLC_Context_window(ctx)
+	lw    next_window, VLC_Context_next_window(ctx)
+	lw    remaining, VLC_Context_remaining(ctx)
+	lb    is_v3, VLC_Context_is_v3(ctx)
+	lb    bit_offset, VLC_Context_bit_offset(ctx)
+	lb    block_index, VLC_Context_block_index(ctx)
+	lb    coeff_index, VLC_Context_coeff_index(ctx)
+	lhu   quant_scale, VLC_Context_quant_scale(ctx) # already masked and shifted into bits 10-15 when the context was created
+	lh    last_y, VLC_Context_last_y(ctx) # v3 DC predictors, one per block type
+	lh    last_cr, VLC_Context_last_cr(ctx)
+	lh    last_cb, VLC_Context_last_cb(ctx)
 
 _vlc_skip_context_load:
-	# Determine how many bytes to output. This whole block of code basically
-	# does this:
+	# Determine how many 16-bit codes to output (all sizes below are halfwords).
+	#   if (max_size <= 0) max_size = 0x3fff0000
 	#   max_size   = min((max_size - 1) * 2, remaining)
 	#   remaining -= max_size
-	bgtz  $a2, .Lmax_size_valid # if (max_size <= 0) max_size = 0x7ffe0000
-	addiu $a2, -1 # else max_size = (max_size - 1) * 2
-	lui   $a2, 0x3fff
+	bgtz  max_size, .Lmax_size_valid
+	addiu max_size, -1
+	lui   max_size, 0x3fff
 .Lmax_size_valid:
-	sll   $a2, 1
+	sll   max_size, 1
 
-	blt   $a2, $t2, .Lmax_size_ok # if (max_size > remaining) max_size = remaining
-	lui   $v1, 0x3800
-	move  $a2, $t2
-.Lmax_size_ok:
-	subu  $t2, $a2 # remaining -= max_size
+	subu  remaining, max_size
+	bgez  remaining, .Lmax_size_ok
+	lui   temp, 0x3800
 
+	addu  max_size, remaining
+	li    remaining, 0
+
+.Lmax_size_ok:
 	# Write the length of the data that will be decoded to first 4 bytes of the
 	# output buffer, which will be then parsed by DecDCTin().
-	srl   $v0, $a2, 1 # output[0] = 0x38000000 | (max_size / 2)
-	or    $v0, $v1
-	sw    $v0, 0($a1)
+	srl   value, max_size, 1 # output[0] = 0x38000000 | (max_size / 2)
+	or    value, temp
+	sw    value, 0(output)
 
 	# Obtain the addresses of the lookup table and jump area in advance so that
 	# they don't have to be retrieved for each coefficient decoded.
-	lw    $t8, _vlc_huffman_table
-	la    $t9, .Lac_prefix_10
+	lw    huffman_table, _vlc_huffman_table
+	la    ac_jump_area, .Lac_prefix_01 - 32
 
-	beqz  $a2, .Lstop_processing
-	addiu $a1, 4 # output = (uint16_t *) &output[1]
+	beqz  max_size, .Lstop_processing
+	addiu output, 4
 
 .Lprocess_next_code_loop: # while (max_size)
 	# This is the "hot" part of the decoder, executed for each code in the
 	# bitstream. The first step is to determine if the next code is a DC or AC
-	# coefficient.
-	bnez  $t7, .Lprocess_ac_coefficient
-	addiu $t7, 1 # coeff_index++
-	bnez  $t4, .Lprocess_dc_v3_coefficient
-	li    $v1, 0x01ff
+	# coefficient; at the same time the GTE is given the task of counting the
+	# number of leading zeroes/ones in the code (which takes 2 more cycles).
+	mtc2  window, C2_LZCS
+
+	bnez  coeff_index, .Lprocess_ac_coefficient
+	addiu coeff_index, 1
+	bnez  is_v3, .Lprocess_dc_v3_coefficient
+	li    temp, 0x1ff
 
 .Lprocess_dc_v2_coefficient: # if (!coeff_index && !is_v3)
 	# The DC coefficient in version 2 frames is not compressed. Value 0x1ff is
 	# used to signal the end of the bitstream.
-	srl   $v0, $t0, 22 # prefix = (window >> (32 - 10))
-	beq   $v0, $v1, .Lstop_processing # if (prefix == 0x1ff) break
-	or    $v0, $t3 # *output = prefix | quant_scale
-	sll   $t0, 10 # window <<= 10
-	b     .Lwrite_value
-	addiu $t5, -10 # bit_offset -= 10
+	#   prefix = window >> (32 - 10)
+	#   if (prefix == 0x1ff) break
+	#   *output = prefix | quant_scale
+	srl   value, window, 22
+	beq   value, temp, .Lstop_processing
+	or    value, quant_scale
+	sll   window, 10
+	addiu bit_offset, -10
+
+	b     .Lfeed_bitstream
+	sh    value, 0(output)
 
 .Lprocess_dc_v3_coefficient: # if (!coeff_index && is_v3)
-	# TODO: version 3 is currently not supported.
-	jr    $ra
-	li    $v0, -1
-
-.Lprocess_ac_coefficient: # if (coeff_index)
-	# Check whether the prefix code is one of the shorter, more common ones,
-	# and start counting the number of leading zeroes/ones using the GTE (which
-	# takes 2 more cycles).
-	srl   $v0, $t0, 30
-	li    $v1, 3
-	beq   $v0, $v1, .Lac_prefix_11
-	li    $v1, 2
-	beq   $v0, $v1, .Lac_prefix_10
-	li    $v1, 1
-	mtc2  $t0, $30
-	beq   $v0, $v1, .Lac_prefix_01
+	# Version 3 DC coefficients are variable-length deltas, prefixed with a
+	# Huffman code indicating their length. Since the prefix code is up to 7
+	# bits long, it makes sense to decode it with a simple 128-byte lookup
+	# table rather than using the GTE. The codes are different for luma and
+	# chroma blocks, so each table entry contains the decoded length for both
+	# block types (packed as two nibbles). Prefix 111111111 is used to signal
+	# the end of the bitstream.
+	#   prefix = window >> (32 - 9)
+	#   if (prefix == 0x1ff) break
+	#   lengths = huffman_table->dc[prefix >> 2]
+	srl   length, window, 23
+	beq   length, temp, .Lstop_processing # temp = 0x1ff, set by the caller's delay slot
+	srl   length, 2
+	addu  length, huffman_table
+
+	addiu $at, block_index, -4 # $at < 0: luma, $at == 0: Cb, $at > 0: Cr
+	bltz  $at, .Ldc_block_y
+	lbu   length, VLC_Table_dc(length)
+	beqz  $at, .Ldc_block_cb
+	andi  length, 15 # if (block_index >= Cb) dc_length = lengths & 15
+
+.Ldc_block_cr: # if (block_index > Cb)
+	# prefix_length = huffman_table->dc_len[dc_length] & 15
+	addu  temp, length, huffman_table
+	lbu   temp, VLC_Table_dc_len(temp)
+	li    $at, 32
+	andi  temp, 15
+
+	sllv  window, window, temp
+	beqz  length, .Ldc_cr_zero # if (dc_length)
+	subu  bit_offset, temp
+
+	subu  $at, length # value = window >> (32 - dc_length)
+	srlv  value, window, $at
+
+	# Decode the sign bit, then add the decoded delta to the current value.
+	#   if (!(window >> 31)) value -= (1 << dc_length) - 1
+	bltz  window, .Ldc_cr_positive
+	li    temp, -1
+	srlv  temp, temp, $at
+	subu  value, temp
+.Ldc_cr_positive:
+	addu  last_cr, value
+	andi  last_cr, 0xff # keep 8 bits so that (last_cr << 2) fits in the 10-bit DC field
+
+.Ldc_cr_zero:
+	sll   temp, last_cr, 2 # *output = (last_cr << 2) | quant_scale
+	or    temp, quant_scale
+	b     .Lupdate_window_dc # update_window(dc_length)
+	sh    temp, 0(output)
+
+.Ldc_block_cb: # if (block_index == Cb)
+	# prefix_length = huffman_table->dc_len[dc_length] & 15
+	addu  temp, length, huffman_table
+	lbu   temp, VLC_Table_dc_len(temp)
+	li    $at, 32
+	andi  temp, 15
+
+	sllv  window, window, temp
+	beqz  length, .Ldc_cb_zero # if (dc_length)
+	subu  bit_offset, temp
+
+	subu  $at, length # value = window >> (32 - dc_length)
+	srlv  value, window, $at
+
+	# Decode the sign bit, then add the decoded delta to the current value.
+	#   if (!(window >> 31)) value -= (1 << dc_length) - 1
+	bltz  window, .Ldc_cb_positive
+	li    temp, -1
+	srlv  temp, temp, $at
+	subu  value, temp
+.Ldc_cb_positive:
+	addu  last_cb, value
+	andi  last_cb, 0xff # keep 8 bits so that (last_cb << 2) fits in the 10-bit DC field
+
+.Ldc_cb_zero:
+	sll   value, last_cb, 2 # *output = (last_cb << 2) | quant_scale
+	or    value, quant_scale
+	b     .Lupdate_window_dc # update_window(dc_length)
+	sh    value, 0(output)
+
+.Ldc_block_y: # if (block_index < Cb)
 	nop
+	srl   length, 4 # dc_length = lengths >> 4
+
+	# prefix_length = huffman_table->dc_len[dc_length] >> 4
+	addu  temp, length, huffman_table
+	lbu   temp, VLC_Table_dc_len(temp)
+	li    $at, 32
+	srl   temp, 4
+
+	sllv  window, window, temp
+	beqz  length, .Ldc_y_zero # if (dc_length)
+	subu  bit_offset, temp
+
+	# value = window >> (32 - dc_length)
+	subu  $at, length
+	srlv  value, window, $at
+
+	# Decode the sign bit, then add the decoded delta to the current value.
+	#   if (!(window >> 31)) value -= (1 << dc_length) - 1
+	bltz  window, .Ldc_y_positive
+	li    temp, -1
+	srlv  temp, temp, $at
+	subu  value, temp
+.Ldc_y_positive:
+	addu  last_y, value
+	andi  last_y, 0xff # keep 8 bits so that (last_y << 2) fits in the 10-bit DC field
+
+.Ldc_y_zero:
+	sll   temp, last_y, 2 # *output = (last_y << 2) | quant_scale
+	or    temp, quant_scale
+	b     .Lupdate_window_dc # update_window(dc_length)
+	sh    temp, 0(output)
 
-	# If the code is longer, retrieve the number of leading zeroes from the GTE
-	# and use it as an index into the jump area. Each block in the area is 8
-	# instructions long and handles decoding a specific prefix.
-	mfc2  $v0, $31
-	li    $v1, 11
-	bgt   $v0, $v1, .Lreturn_error # if (prefix > 11) return -1
-	sll   $v0, 5 # jump_addr = &ac_jump_area[prefix * 8 * sizeof(u32)]
-	addu  $v0, $t9
-	jr    $v0
+.Lprocess_ac_coefficient: # if (coeff_index)
+	# Check whether the prefix code is 10 or 11 (i.e. if it starts with 1). If
+	# not, retrieve the number of leading zeroes from the GTE and use it as an
+	# index into the jump area. Each block in the area is 8 instructions long
+	# and handles decoding a specific prefix.
+	mfc2  temp, C2_LZCR
+
+	bltz  window, .Lac_prefix_1 # if (!(window >> 31))
+	addiu $at, temp, -11 # if (prefix > 11) return -1
+	bgtz  $at, .Lreturn_error
+	sll   temp, 5 # jump_addr = &ac_jump_area[prefix * 8 * sizeof(uint32_t)]
+	addu  temp, ac_jump_area
+	jr    temp
 	nop
 
 .Lreturn_error:
-	jr    $ra
+	b     .Lreturn
 	li    $v0, -1
 
-.Lac_prefix_11:
-	# Prefix 11 is followed by a single bit.
-	srl   $v0, $t0, 28 # index = ((window >> (32 - 2 - 1)) & 1) * sizeof(u16)
-	andi  $v0, 2
-	addu  $v0, $t8 # value = table->lut0[index]
-	lhu   $v0, DECDCTTAB_lut0($v0)
-	sll   $t0, 3 # window <<= 3
-	b     .Lwrite_value
-	addiu $t5, -3 # bit_offset -= 3
-	#.word 0
+.Lac_prefix_1: # if (window >> 31)
+	sll   window, 1
+	bltz  window, .Lac_prefix_11
+	li    temp, 0xfe00
 
 .Lac_prefix_10:
 	# Prefix 10 marks the end of a block.
-	li    $v0, 0xfe00 # value = 0xfe00
-	sll   $t0, 2 # window <<= 2
-	addiu $t5, -2 # bit_offset -= 2
-	addiu $t6, -1 # block_index--
-	bgez  $t6, .Lwrite_value
-	li    $t7, 0 # coeff_index = 0
-	b     .Lwrite_value
-	li    $t6, 5 # if (block_index < 0) block_index = 5
+	#   *output = 0xfe00
+	#   coeff_index = 0
+	#   if (--block_index < Y3) block_index = Cr
+	sll   window, 1
+	addiu bit_offset, -2
+	sh    temp, 0(output)
+
+	addiu block_index, -1
+	bgez  block_index, .Lfeed_bitstream
+	li    coeff_index, 0
+	b     .Lfeed_bitstream
+	li    block_index, 5
+
+.Lac_prefix_11:
+	# Prefix 11 is followed by a single bit. Note that the 10/11 prefix check
+	# already shifts the window by one bit (without updating the bit offset).
+	#   index   = ((window >> (32 - 1 - 1)) & 1) * sizeof(uint16_t)
+	#   *output = huffman_table->ac0[index]
+	srl   value, window, 29
+	andi  value, 2
+	addu  value, huffman_table
+	lhu   value, VLC_Table_ac0(value)
+	sll   window, 2
+	addiu bit_offset, -3
+
+	b     .Lfeed_bitstream
+	sh    value, 0(output)
 
 .Lac_prefix_01:
 	# Prefix 01 can be followed by a 2-bit lookup index starting with 1, or a
 	# 3-bit lookup index starting with 0. A 32-bit lookup table is used,
 	# containing both MDEC codes and lengths.
-	srl   $v0, $t0, 25 # index = ((window >> (32 - 2 - 3)) & 7) * sizeof(u32)
-	andi  $v0, 28
-	addu  $v0, $t8 # value = table->lut2[index]
-	lw    $v0, DECDCTTAB_lut2($v0)
-	b     .Lupdate_window_and_write
-	srl   $v1, $v0, 16 # length = value >> 16
+	#   index   = ((window >> (32 - 2 - 3)) & 7) * sizeof(uint32_t)
+	#   *output = huffman_table->ac2[index] & 0xffff
+	#   length  = huffman_table->ac2[index] >> 16
+	srl   value, window, 25
+	andi  value, 28
+	addu  value, huffman_table
+	lw    value, VLC_Table_ac2(value)
+
+	b     .Lupdate_window_ac # update_window(value >> 16)
+	sh    value, 0(output)
 	.word 0, 0
 
 .Lac_prefix_001:
 	# Prefix 001 can be followed by a 6-bit lookup index starting with 00, or a
 	# 3-bit lookup index starting with 01/10/11.
-	srl   $v0, $t0, 21 # index = ((window >> (32 - 3 - 6)) & 63) * sizeof(u32)
-	andi  $v0, 252
-	addu  $v0, $t8 # value = table->lut3[index]
-	lw    $v0, DECDCTTAB_lut3($v0)
-	b     .Lupdate_window_and_write
-	srl   $v1, $v0, 16 # length = value >> 16
+	#   index   = ((window >> (32 - 3 - 6)) & 63) * sizeof(uint32_t)
+	#   *output = huffman_table->ac3[index] & 0xffff
+	#   length  = huffman_table->ac3[index] >> 16
+	srl   value, window, 21
+	andi  value, 252
+	addu  value, huffman_table
+	lw    value, VLC_Table_ac3(value)
+
+	b     .Lupdate_window_ac # update_window(value >> 16)
+	sh    value, 0(output)
 	.word 0, 0
 
 .Lac_prefix_0001:
 	# Prefix 0001 is followed by a 3-bit lookup index.
-	srl   $v0, $t0, 24 # index = ((window >> (32 - 4 - 3)) & 7) * sizeof(u16)
-	andi  $v0, 14
-	addu  $v0, $t8 # value = table->lut4[index]
-	lhu   $v0, DECDCTTAB_lut4($v0)
-	sll   $t0, 7 # window <<= 4 + 3
-	b     .Lwrite_value
-	addiu $t5, -7 # bit_offset -= 4 + 3
-	.word 0
+	#   index   = ((window >> (32 - 4 - 3)) & 7) * sizeof(uint16_t)
+	#   *output = huffman_table->ac4[index]
+	srl   value, window, 24
+	andi  value, 14
+	addu  value, huffman_table
+	lhu   value, VLC_Table_ac4(value)
+	sll   window, 7
+	addiu bit_offset, -7
+
+	b     .Lfeed_bitstream
+	sh    value, 0(output)
 
 .Lac_prefix_00001:
 	# Prefix 00001 is followed by a 3-bit lookup index.
-	srl   $v0, $t0, 23 # index = ((window >> (32 - 5 - 3)) & 7) * sizeof(u16)
-	andi  $v0, 14
-	addu  $v0, $t8 # value = table->lut5[index]
-	lhu   $v0, DECDCTTAB_lut5($v0)
-	sll   $t0, 8 # window <<= 5 + 3
-	b     .Lwrite_value
-	addiu $t5, -8 # bit_offset -= 5 + 3
-	.word 0
+	#   index   = ((window >> (32 - 5 - 3)) & 7) * sizeof(uint16_t)
+	#   *output = huffman_table->ac5[index]
+	srl   value, window, 23
+	andi  value, 14
+	addu  value, huffman_table
+	lhu   value, VLC_Table_ac5(value)
+	sll   window, 8
+	addiu bit_offset, -8
+
+	b     .Lfeed_bitstream
+	sh    value, 0(output)
 
 .Lac_prefix_000001:
 	# Prefix 000001 is an escape code followed by a full 16-bit MDEC value.
-	srl   $v0, $t0, 10 # value = window >> (32 - 6 - 16)
-	sll   $t0, 22 # window <<= 6 + 16
-	b     .Lwrite_value
-	addiu $t5, -22 # bit_offset -= 6 + 16
-	.word 0, 0, 0, 0
+	#   *output = window >> (32 - 6 - 16)
+	srl   value, window, 10
+	sll   window, 22
+	addiu bit_offset, -22
+
+	b     .Lfeed_bitstream
+	sh    value, 0(output)
+	.word 0, 0, 0
 
 .Lac_prefix_0000001:
 	# Prefix 0000001 is followed by a 4-bit lookup index.
-	srl   $v0, $t0, 20 # index = ((window >> (32 - 7 - 4)) & 15) * sizeof(u16)
-	andi  $v0, 30
-	addu  $v0, $t8 # value = table->lut7[index]
-	lhu   $v0, DECDCTTAB_lut7($v0)
-	sll   $t0, 11 # window <<= 7 + 4
-	b     .Lwrite_value
-	addiu $t5, -11 # bit_offset -= 7 + 4
-	.word 0
+	#   index   = ((window >> (32 - 7 - 4)) & 15) * sizeof(uint16_t)
+	#   *output = huffman_table->ac7[index]
+	srl   value, window, 20
+	andi  value, 30
+	addu  value, huffman_table
+	lhu   value, VLC_Table_ac7(value)
+	sll   window, 11
+	addiu bit_offset, -11
+
+	b     .Lfeed_bitstream
+	sh    value, 0(output)
 
 .Lac_prefix_00000001:
 	# Prefix 00000001 is followed by a 5-bit lookup index.
-	srl   $v0, $t0, 18 # index = ((window >> (32 - 8 - 5)) & 31) * sizeof(u16)
-	andi  $v0, 62
-	addu  $v0, $t8 # value = table->lut8[index]
-	lhu   $v0, DECDCTTAB_lut8($v0)
-	sll   $t0, 13 # window <<= 8 + 5
-	b     .Lwrite_value
-	addiu $t5, -13 # bit_offset -= 8 + 5
-	.word 0
+	#   index   = ((window >> (32 - 8 - 5)) & 31) * sizeof(uint16_t)
+	#   *output = huffman_table->ac8[index]
+	srl   value, window, 18
+	andi  value, 62
+	addu  value, huffman_table
+	lhu   value, VLC_Table_ac8(value)
+	sll   window, 13
+	addiu bit_offset, -13
+
+	b     .Lfeed_bitstream
+	sh    value, 0(output)
 
 .Lac_prefix_000000001:
 	# Prefix 000000001 is followed by a 5-bit lookup index.
-	srl   $v0, $t0, 17 # index = ((window >> (32 - 9 - 5)) & 31) * sizeof(u16)
-	andi  $v0, 62
-	addu  $v0, $t8 # value = table->lut9[index]
-	lhu   $v0, DECDCTTAB_lut9($v0)
-	sll   $t0, 14 # window <<= 9 + 5
-	b     .Lwrite_value
-	addiu $t5, -14 # bit_offset -= 9 + 5
-	.word 0
+	#   index   = ((window >> (32 - 9 - 5)) & 31) * sizeof(uint16_t)
+	#   *output = huffman_table->ac9[index]
+	srl   value, window, 17
+	andi  value, 62
+	addu  value, huffman_table
+	lhu   value, VLC_Table_ac9(value)
+	sll   window, 14
+	addiu bit_offset, -14
+
+	b     .Lfeed_bitstream
+	sh    value, 0(output)
 
 .Lac_prefix_0000000001:
 	# Prefix 0000000001 is followed by a 5-bit lookup index.
-	srl   $v0, $t0, 16 # index = ((window >> (32 - 10 - 5)) & 31) * sizeof(u16)
-	andi  $v0, 62
-	addu  $v0, $t8 # value = table->lut10[index]
-	lhu   $v0, DECDCTTAB_lut10($v0)
-	sll   $t0, 15 # window <<= 10 + 5
-	b     .Lwrite_value
-	addiu $t5, -15 # bit_offset -= 10 + 5
-	.word 0
+	#   index   = ((window >> (32 - 10 - 5)) & 31) * sizeof(uint16_t)
+	#   *output = huffman_table->ac10[index]
+	srl   value, window, 16
+	andi  value, 62
+	addu  value, huffman_table
+	lhu   value, VLC_Table_ac10(value)
+	sll   window, 15
+	addiu bit_offset, -15
+
+	b     .Lfeed_bitstream
+	sh    value, 0(output)
 
 .Lac_prefix_00000000001:
 	# Prefix 00000000001 is followed by a 5-bit lookup index.
-	srl   $v0, $t0, 15 # index = ((window >> (32 - 11 - 5)) & 31) * sizeof(u16)
-	andi  $v0, 62
-	addu  $v0, $t8 # value = table->lut11[index]
-	lhu   $v0, DECDCTTAB_lut11($v0)
-	sll   $t0, 16 # window <<= 11 + 5
-	b     .Lwrite_value
-	addiu $t5, -16 # bit_offset -= 11 + 5
-	.word 0
+	#   index   = ((window >> (32 - 11 - 5)) & 31) * sizeof(uint16_t)
+	#   *output = huffman_table->ac11[index]
+	srl   value, window, 15
+	andi  value, 62
+	addu  value, huffman_table
+	lhu   value, VLC_Table_ac11(value)
+	sll   window, 16
+	addiu bit_offset, -16
+
+	b     .Lfeed_bitstream
+	sh    value, 0(output)
 
 .Lac_prefix_000000000001:
 	# Prefix 000000000001 is followed by a 5-bit lookup index.
-	srl   $v0, $t0, 14 # index = ((window >> (32 - 12 - 5)) & 31) * sizeof(u16)
-	andi  $v0, 62
-	addu  $v0, $t8 # value = table->lut12[index]
-	lhu   $v0, DECDCTTAB_lut12($v0)
-	sll   $t0, 17 # window <<= 12 + 5
-	b     .Lwrite_value
-	addiu $t5, -17 # bit_offset -= 12 + 5
-	.word 0
-
-.Lupdate_window_and_write:
-	sllv  $t0, $t0, $v1 # window <<= length
-	subu  $t5, $v1 # bit_offset -= length
-.Lwrite_value:
-	sh    $v0, 0($a1)
+	#   index   = ((window >> (32 - 12 - 5)) & 31) * sizeof(uint16_t)
+	#   *output = huffman_table->ac12[index]
+	srl   value, window, 14
+	andi  value, 62
+	addu  value, huffman_table
+	lhu   value, VLC_Table_ac12(value)
+	sll   window, 17
+	addiu bit_offset, -17
+
+	b     .Lfeed_bitstream
+	sh    value, 0(output)
+
+.Lupdate_window_ac:
+	srl   length, value, 16
+.Lupdate_window_dc:
+	sllv  window, window, length
+	subu  bit_offset, length
+
 .Lfeed_bitstream:
 	# Update the window. This makes sure the next iteration of the loop will be
 	# able to read up to 32 bits from the bitstream.
-	bgez  $t5, .Lskip_feeding # if (bit_offset < 0)
-	addiu $a2, -1 # max_size--
-
-	subu  $v0, $0, $t5 # window = next_window << (-bit_offset)
-	sllv  $t0, $t1, $v0
-	lw    $t1, 0($a3) # next_window = (*input << 16) | (*input >> 16)
-	addiu $t5, 32 # bit_offset += 32
-	srl   $v0, $t1, 16
-	sll   $t1, 16
-	or    $t1, $v0
-	addiu $a3, 4 # input++
+	bgez  bit_offset, .Lskip_feeding # if (bit_offset < 0)
+	addiu max_size, -1
+
+	subu  temp, $0, bit_offset # window = next_window << (-bit_offset)
+	sllv  window, next_window, temp
+	lw    next_window, 0(input) # next_window = (*input << 16) | (*input >> 16)
+	addiu bit_offset, 32
+	srl   temp, next_window, 16
+	sll   next_window, 16
+	or    next_window, temp
+	addiu input, 4
 
 .Lskip_feeding:
-	srlv  $v0, $t1, $t5 # window |= next_window >> bit_offset
-	or    $t0, $v0
+	srlv  temp, next_window, bit_offset # window |= next_window >> bit_offset
+	or    window, temp
 
-	bnez  $a2, .Lprocess_next_code_loop
-	addiu $a1, 2 # output++
+	bnez  max_size, .Lprocess_next_code_loop
+	addiu output, 2
 
 .Lstop_processing:
 	# If remaining = 0, skip flushing the context, pad the output buffer with
 	# end-of-block codes if necessary and return 0. Otherwise flush the context
 	# and return 1.
-	beqz  $t2, .Lpad_output_buffer
-	nop
-
-	sw    $a3, VLC_Context_input($a0)
-	sw    $t0, VLC_Context_window($a0)
-	sw    $t1, VLC_Context_next_window($a0)
-	sw    $t2, VLC_Context_remaining($a0)
-	sh    $t3, VLC_Context_quant_scale($a0)
-	sb    $t4, VLC_Context_is_v3($a0)
-	sb    $t5, VLC_Context_bit_offset($a0)
-	sb    $t6, VLC_Context_block_index($a0)
-	sb    $t7, VLC_Context_coeff_index($a0)
-
-	jr    $ra
+	beqz  remaining, .Lpad_output_buffer
+	li    temp, 0xfe00 # delay slot: end-of-block code used by the padding loop
+	sw    input, VLC_Context_input(ctx) # flush every field DecDCTvlcContinue() reloads
+	sw    window, VLC_Context_window(ctx)
+	sw    next_window, VLC_Context_next_window(ctx)
+	sw    remaining, VLC_Context_remaining(ctx)
+	sb    is_v3, VLC_Context_is_v3(ctx) # DecDCTvlcStart() keeps this in a register only
+	sb    bit_offset, VLC_Context_bit_offset(ctx)
+	sb    block_index, VLC_Context_block_index(ctx)
+	sb    coeff_index, VLC_Context_coeff_index(ctx)
+	sh    quant_scale, VLC_Context_quant_scale(ctx) # DecDCTvlcStart() keeps this in a register only
+	sh    last_y, VLC_Context_last_y(ctx)
+	sh    last_cr, VLC_Context_last_cr(ctx)
+	sh    last_cb, VLC_Context_last_cb(ctx)
+	b     .Lreturn
 	li    $v0, 1
 
 .Lpad_output_buffer:
-	beqz  $a2, .Lreturn_zero
-	li    $v0, 0xfe00
-.Lpad_output_buffer_loop: # while (max_size)
-	sh    $v0, 0($a1) # *output = 0xfe00
-	addiu $a2, -1 # max_size--
-	bnez  $a2, .Lpad_output_buffer_loop
-	addiu $a1, 2 # output++
+	beqz  max_size, .Lreturn # nothing left to pad
+	li    $v0, 0 # delay slot: return value 0, set on both paths
 
-.Lreturn_zero:
+.Lpad_output_buffer_loop: # while (max_size)
+	sh    temp, 0(output) # *output = 0xfe00 (temp was loaded at .Lstop_processing)
+	addiu max_size, -1 # max_size--
+	bnez  max_size, .Lpad_output_buffer_loop
+	addiu output, 2 # delay slot: output++
+
+.Lreturn:
+	lw    $s0,  0($sp) # restore the callee-saved registers pushed on entry
+	lw    $s1,  4($sp)
+	lw    $s2,  8($sp)
+	lw    $s3, 12($sp)
 	jr    $ra
-	li    $v0, 0
+	addiu $sp, 16 # delay slot: release the 16-byte stack frame
diff --git a/libpsn00b/psxpress/vlc2.c b/libpsn00b/psxpress/vlc2.c
index 9eb99bf..24c54ce 100644
--- a/libpsn00b/psxpress/vlc2.c
+++ b/libpsn00b/psxpress/vlc2.c
@@ -63,7 +63,7 @@ static const uint32_t _compressed_table[TABLE_LENGTH] = {
 static VLC_Context	_default_context;
 static size_t		_max_buffer_size = 0;
 
-const DECDCTTAB2	*_vlc_huffman_table2 = 0;
+const DECDCTTAB *_vlc_huffman_table2 = 0;
 
 /* VLC decoder */
 
@@ -77,14 +77,17 @@ int __attribute__((optimize(3))) DecDCTvlcContinue2(
 	VLC_Context *ctx, uint32_t *buf, size_t max_size
 ) {
 	const uint32_t	*input		= ctx->input;
-	uint32_t		remaining	= ctx->remaining;
 	uint32_t		window		= ctx->window;
 	uint32_t		next_window	= ctx->next_window;
-	uint16_t		quant_scale	= ctx->quant_scale;
+	uint32_t		remaining	= ctx->remaining;
+	int				is_v3		= ctx->is_v3;
+	int				bit_offset	= ctx->bit_offset;
 	int				block_index	= ctx->block_index;
 	int				coeff_index	= ctx->coeff_index;
-	int				bit_offset	= ctx->bit_offset;
-	int				is_v3		= ctx->is_v3;
+	uint16_t		quant_scale	= ctx->quant_scale;
+	int16_t			last_y		= ctx->last_y;
+	int16_t			last_cr		= ctx->last_cr;
+	int16_t			last_cb		= ctx->last_cb;
 
 	//if (!_vlc_huffman_table2)
 		//return -1;
@@ -122,13 +125,13 @@ int __attribute__((optimize(3))) DecDCTvlcContinue2(
 			} else if (window >> 24) {
 				// The first lookup table is for codes that not start with
 				// 00000000.
-				value = _vlc_huffman_table2->lut[_get_bits_unsigned(13)];
+				value = _vlc_huffman_table2->ac[_get_bits_unsigned(13)];
 				_advance_window(value >> 16);
 				*output = (uint16_t) value;
 			} else {
 				// If the code starts with 00000000, use the second lookup
 				// table.
-				value = _vlc_huffman_table2->lut00[_get_bits_unsigned(17)];
+				value = _vlc_huffman_table2->ac00[_get_bits_unsigned(17)];
 				_advance_window(value >> 16);
 				*output = (uint16_t) value;
 			}
@@ -176,12 +179,15 @@ int __attribute__((optimize(3))) DecDCTvlcContinue2(
 		return 0;
 
 	ctx->input			= input;
-	ctx->remaining		= remaining;
 	ctx->window			= window;
 	ctx->next_window	= next_window;
+	ctx->remaining		= remaining;
+	ctx->bit_offset		= bit_offset;
 	ctx->block_index	= block_index;
 	ctx->coeff_index	= coeff_index;
-	ctx->bit_offset		= bit_offset;
+	ctx->last_y			= last_y;
+	ctx->last_cr		= last_cr;
+	ctx->last_cb		= last_cb;
 	return 1;
 }
 
@@ -197,21 +203,24 @@ int DecDCTvlcStart2(
 		return -1;
 
 	ctx->input			= &input[2];
-	ctx->remaining		= (header->mdec0_header & 0xffff) * 2;
 	ctx->window			= (input[0] << 16) | (input[0] >> 16);
 	ctx->next_window	= (input[1] << 16) | (input[1] >> 16);
-	ctx->quant_scale	= (header->quant_scale & 63) << 10;
+	ctx->remaining		= (header->mdec0_header & 0xffff) * 2;
+	ctx->is_v3			= (header->version >= 3);
+	ctx->bit_offset		= 32;
 	ctx->block_index	= 0;
 	ctx->coeff_index	= 0;
-	ctx->bit_offset		= 32;
-	ctx->is_v3			= (header->version == 3);
+	ctx->quant_scale	= (header->quant_scale & 63) << 10;
+	ctx->last_y			= 0;
+	ctx->last_cr		= 0;
+	ctx->last_cb		= 0;
 
 	return DecDCTvlcContinue2(ctx, buf, max_size);
 }
 
 /* Stateful VLC decoder API (for Sony SDK compatibility) */
 
-int DecDCTvlc2(const uint32_t *bs, uint32_t *buf, DECDCTTAB2 *table) {
+int DecDCTvlc2(const uint32_t *bs, uint32_t *buf, DECDCTTAB *table) {
 	if (table)
 		_vlc_huffman_table2 = table;
 
@@ -230,7 +239,7 @@ size_t DecDCTvlcSize2(size_t size) {
 
 /* Lookup table decompressor */
 
-void DecDCTvlcBuild(DECDCTTAB2 *table) {
+void DecDCTvlcBuild(DECDCTTAB *table) {
 	uint32_t *output    = (uint32_t *) table;
 	_vlc_huffman_table2 = table;
 
diff --git a/libpsn00b/psxspu/common.c b/libpsn00b/psxspu/common.c
index 6ccbef4..1275621 100644
--- a/libpsn00b/psxspu/common.c
+++ b/libpsn00b/psxspu/common.c
@@ -1,6 +1,6 @@
 /*
  * PSn00bSDK SPU library (common functions)
- * (C) 2022 spicyjpeg - MPL licensed
+ * (C) 2022-2023 spicyjpeg - MPL licensed
  */
 
 #include <stdint.h>
@@ -32,7 +32,7 @@ static void _wait_status(uint16_t mask, uint16_t value) {
 			return;
 	}
 
-	_sdk_log("status register timeout (0x%04x)\n", SPU_STAT);
+	_sdk_log("timeout, status=0x%04x\n", SPU_STAT);
 }
 
 static size_t _dma_transfer(uint32_t *data, size_t length, int write) {
author	spicyjpeg <thatspicyjpeg@gmail.com>	2023-01-23 09:36:22 +0100
committer	spicyjpeg <thatspicyjpeg@gmail.com>	2023-01-23 09:36:22 +0100
commit	09f321e37fc187affa664d32e36e32c0533a7e8e (patch)
tree	27f846c194d92a9f4f8e3daea4ff2317e3e66894
parent	a21e949c9aea98cb4b3feee48bb98579bbdfba70 (diff)
download	psn00bsdk-09f321e37fc187affa664d32e36e32c0533a7e8e.tar.gz