aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: spicyjpeg <thatspicyjpeg@gmail.com> 2023-01-23 09:36:22 +0100
committer: spicyjpeg <thatspicyjpeg@gmail.com> 2023-01-23 09:36:22 +0100
commit: 09f321e37fc187affa664d32e36e32c0533a7e8e (patch)
tree: 27f846c194d92a9f4f8e3daea4ff2317e3e66894
parent: a21e949c9aea98cb4b3feee48bb98579bbdfba70 (diff)
Add BS v3 decoding, fix MDEC API and strvideo example
-rw-r--r-- examples/mdec/strvideo/main.c 74
-rw-r--r-- libpsn00b/include/psxpress.h 126
-rw-r--r-- libpsn00b/psxpress/README.md 29
-rw-r--r-- libpsn00b/psxpress/mdec.c 37
-rw-r--r-- libpsn00b/psxpress/vlc.c 160
-rw-r--r-- libpsn00b/psxpress/vlc.s 743
-rw-r--r-- libpsn00b/psxpress/vlc2.c 39
-rw-r--r-- libpsn00b/psxspu/common.c 4
8 files changed, 763 insertions, 449 deletions
diff --git a/examples/mdec/strvideo/main.c b/examples/mdec/strvideo/main.c
index 28d39b2..57cb6ef 100644
--- a/examples/mdec/strvideo/main.c
+++ b/examples/mdec/strvideo/main.c
@@ -1,6 +1,6 @@
/*
* PSn00bSDK .STR FMV playback example
- * (C) 2022 spicyjpeg - MPL licensed
+ * (C) 2022-2023 spicyjpeg - MPL licensed
*
* This example demonstrates playback of full-motion video in the standard .STR
* format, using the MDEC for frame decoding and XA for audio. Decoded frames
@@ -34,9 +34,10 @@
* Playback is stopped once the .STR header is no longer present in sectors
* read.
*
- * Note that PSn00bSDK's bitstream decoding API only supports version 1 and 2
- * bitstreams currently, so make sure your .STR files are encoded as v2 and not
- * v3.
+ * PSn00bSDK's bitstream decoding API supports both version 2 and 3 bitstreams.
+ * Encoding your .STR files as v3 may result in slightly higher quality
+ * depending on the encoder, but also higher CPU usage during playback compared
+ * to the older v2.
*/
#include <stdint.h>
@@ -102,13 +103,12 @@ void init_context(RenderContext *ctx) {
FntOpen(4, 12, 312, 16, 2, 256);
}
-void display(RenderContext *ctx, int sync) {
+void display(RenderContext *ctx) {
Framebuffer *db;
ctx->db_active ^= 1;
DrawSync(0);
- if (sync)
- VSync(0);
+ //VSync(0);
db = &(ctx->db[ctx->db_active]);
PutDrawEnv(&(db->draw));
@@ -163,13 +163,13 @@ typedef struct {
volatile int8_t cur_frame, cur_slice;
} StreamContext;
-StreamContext str_ctx;
+static StreamContext str_ctx;
// This buffer is used by cd_sector_handler() as a temporary area for sectors
// read from the CD. Due to DMA limitations it can't be allocated on the stack
// (especially not in the interrupt callbacks' stack, whose size is very
// limited).
-STR_Header sector_header;
+static STR_Header sector_header;
void cd_sector_handler(void) {
StreamBuffer *frame = &str_ctx.frames[str_ctx.cur_frame];
@@ -268,7 +268,7 @@ void init_stream(void) {
// optional but makes the decompressor slightly faster. See the libpsxpress
// documentation for more details.
DecDCTvlcSize(0x8000);
- DecDCTvlcCopyTable((DECDCTTAB *) 0x1f800000);
+ DecDCTvlcCopyTableV3((VLC_TableV3 *) 0x1f800000);
str_ctx.cur_frame = 0;
str_ctx.cur_slice = 0;
@@ -309,7 +309,7 @@ void start_stream(CdlFILE *file) {
static RenderContext ctx;
-#define SHOW_STATUS(...) { FntPrint(-1, __VA_ARGS__); FntFlush(-1); display(&ctx, 1); }
+#define SHOW_STATUS(...) { FntPrint(-1, __VA_ARGS__); FntFlush(-1); display(&ctx); }
#define SHOW_ERROR(...) { SHOW_STATUS(__VA_ARGS__); while (1) __asm__("nop"); }
int main(int argc, const char* argv[]) {
@@ -318,7 +318,7 @@ int main(int argc, const char* argv[]) {
SHOW_STATUS("INITIALIZING\n");
SpuInit();
CdInit();
- InitGeom(); // Required for PSn00bSDK's DecDCTvlc()
+ InitGeom(); // GTE initialization required by the VLC decompressor
DecDCTReset(0);
SHOW_STATUS("OPENING VIDEO FILE\n");
@@ -330,8 +330,9 @@ int main(int argc, const char* argv[]) {
init_stream();
start_stream(&file);
- // Disable framebuffer clearing to get rid of flickering during playback.
- display(&ctx, 1);
+ // Clear the screen, then disable framebuffer clearing to get rid of
+ // flickering during playback.
+ display(&ctx);
ctx.db[0].draw.isbg = 0;
ctx.db[1].draw.isbg = 0;
#ifdef DISP_24BPP
@@ -339,9 +340,13 @@ int main(int argc, const char* argv[]) {
ctx.db[1].disp.isrgb24 = 1;
#endif
- int decode_errors = 0;
+ int frame_time = 1, decode_errors = 0;
while (1) {
+#ifdef DRAW_OVERLAY
+ int frame_start = TIMER_VALUE(1);
+#endif
+
// Wait for a full frame to be read from the disc and decompress the
// bitstream into the format expected by the MDEC. If the video has
// ended, restart playback from the beginning.
@@ -355,38 +360,45 @@ int main(int argc, const char* argv[]) {
}
#ifdef DRAW_OVERLAY
- // Measure CPU usage of the decompressor using the hblank counter.
- int total_time = TIMER_VALUE(1) + 1;
- TIMER_VALUE(1) = 0;
+ int decode_time = TIMER_VALUE(1);
#endif
- if (DecDCTvlc(frame->bs_data, frame->mdec_data)) {
+ VLC_Context vlc_ctx;
+ if (DecDCTvlcStart(
+ &vlc_ctx,
+ frame->mdec_data,
+ sizeof(frame->mdec_data) / 4,
+ frame->bs_data
+ )) {
decode_errors++;
continue;
}
#ifdef DRAW_OVERLAY
- int cpu_usage = TIMER_VALUE(1) * 100 / total_time;
+ // Calculate CPU usage of the decompressor.
+ decode_time = (TIMER_VALUE(1) - decode_time) & 0xffff;
+ int cpu_usage = decode_time * 100 / frame_time;
#endif
// Wait for the MDEC to finish decoding the previous frame, then flip
// the framebuffers to display it and prepare the buffer for the next
// frame.
- // NOTE: you should *not* call VSync(0) during playback, as the refresh
- // rate of the GPU is not synced to the video's frame rate. If you want
- // to minimize screen tearing, consider triple buffering instead (i.e.
- // always keep 2 fully decoded frames in VRAM and use VSyncCallback()
- // to register a function that displays the next decoded frame whenever
- // vblank occurs).
+ // NOTE: as the refresh rate of the GPU is not synced to the video's
+ // frame rate, this VSync(0) call may potentially end up waiting too
+ // long and desynchronizing playback. A better solution would be to
+ // implement triple buffering (i.e. always keep 2 fully decoded frames
+ // in VRAM and use VSyncCallback() to register a function that displays
+ // the next decoded frame if available whenever vblank occurs).
+ VSync(0);
DecDCTinSync(0);
DecDCToutSync(0);
#ifdef DRAW_OVERLAY
- FntPrint(-1, "FRAME:%5d READ ERRORS: %5d\n", str_ctx.frame_id, str_ctx.dropped_frames);
- FntPrint(-1, "CPU: %5d%% DECODE ERRORS:%5d\n", cpu_usage, decode_errors);
+ FntPrint(-1, "FRAME:%6d READ ERRORS: %6d\n", str_ctx.frame_id, str_ctx.dropped_frames);
+ FntPrint(-1, "CPU: %6d%% DECODE ERRORS:%6d\n", cpu_usage, decode_errors);
FntFlush(-1);
#endif
- display(&ctx, 0);
+ display(&ctx);
// Feed the newly decompressed frame to the MDEC. The MDEC will not
// actually start decoding it until an output buffer is also configured
@@ -414,6 +426,10 @@ int main(int argc, const char* argv[]) {
str_ctx.slices[str_ctx.cur_slice],
BLOCK_SIZE * str_ctx.slice_pos.h / 2
);
+
+#ifdef DRAW_OVERLAY
+ frame_time = (TIMER_VALUE(1) - frame_start) & 0xffff;
+#endif
}
return 0;
diff --git a/libpsn00b/include/psxpress.h b/libpsn00b/include/psxpress.h
index dc1d52c..c3b13f4 100644
--- a/libpsn00b/include/psxpress.h
+++ b/libpsn00b/include/psxpress.h
@@ -1,6 +1,6 @@
/*
* PSn00bSDK MDEC library
- * (C) 2022 spicyjpeg - MPL licensed
+ * (C) 2022-2023 spicyjpeg - MPL licensed
*/
/**
@@ -17,7 +17,9 @@
* FMV playback is not part of this library per se, but can implemented using
* the APIs defined here alongside some code to stream data from the CD drive.
*
- * Currently only version 1 and 2 .BS files are supported.
+ * Currently bitstream versions 1, 2 and 3 are supported. Version 0 and .IKI
+ * bitstreams are not supported, but no encoder is publicly available for those
+ * anyway.
*/
#ifndef __PSXPRESS_H
@@ -34,28 +36,26 @@ typedef struct _DECDCTENV {
int16_t dct[64]; // Inverse DCT matrix (2.14 fixed-point)
} DECDCTENV;
-// This is the "small" lookup table used by DecDCTvlc(). It can be copied to
-// the scratchpad.
+typedef struct _VLC_TableV2 {
+ uint16_t ac0[2];
+ uint32_t ac2[8], ac3[64];
+ uint16_t ac4[8], ac5[8], ac7[16], ac8[32];
+ uint16_t ac9[32], ac10[32], ac11[32], ac12[32];
+} VLC_TableV2;
+
+typedef struct _VLC_TableV3 {
+ uint16_t ac0[2];
+ uint32_t ac2[8], ac3[64];
+ uint16_t ac4[8], ac5[8], ac7[16], ac8[32];
+ uint16_t ac9[32], ac10[32], ac11[32], ac12[32];
+ uint8_t dc[128], dc_len[9];
+ uint8_t _reserved[3];
+} VLC_TableV3;
+
typedef struct _DECDCTTAB {
- uint16_t lut0[2];
- uint32_t lut2[8];
- uint32_t lut3[64];
- uint16_t lut4[8];
- uint16_t lut5[8];
- uint16_t lut7[16];
- uint16_t lut8[32];
- uint16_t lut9[32];
- uint16_t lut10[32];
- uint16_t lut11[32];
- uint16_t lut12[32];
+ uint32_t ac[8192], ac00[512];
} DECDCTTAB;
-// This is the "large" table used by DecDCTvlc2().
-typedef struct _DECDCTTAB2 {
- uint32_t lut[8192];
- uint32_t lut00[512];
-} DECDCTTAB2;
-
typedef enum _DECDCTMODE {
DECDCT_MODE_24BPP = 1,
DECDCT_MODE_16BPP = 0,
@@ -66,8 +66,9 @@ typedef enum _DECDCTMODE {
typedef struct _VLC_Context {
const uint32_t *input;
uint32_t window, next_window, remaining;
- uint16_t quant_scale;
int8_t is_v3, bit_offset, block_index, coeff_index;
+ uint16_t quant_scale;
+ int16_t last_y, last_cr, last_cb;
} VLC_Context;
// Despite what some docs claim, the "number of 32-byte blocks" and "always
@@ -233,8 +234,9 @@ int DecDCToutSync(int mode);
* frame) into a buffer that can be passed to DecDCTin(). This function uses a
* small (<1 KB) lookup table combined with the GTE to accelerate the process;
* performance is roughly on par with DecDCTvlcStart2() if the lookup table
- * is copied to the scratchpad beforehand by calling DecDCTvlcCopyTable(). The
- * contents of the GTE's LZCR register, if any, will be destroyed.
+ * is copied to the scratchpad beforehand by calling DecDCTvlcCopyTableV2() or
+ * DecDCTvlcCopyTableV3(). The contents of the GTE's LZCS and LZCR registers,
+ * if any, will be destroyed.
*
* A VLC_Context object must be created and passed to this function, which will
* then proceed to initialize its fields. The max_size argument sets the
@@ -256,7 +258,7 @@ int DecDCToutSync(int mode);
* @param bs
* @return 0, 1 if more data needs to be output or -1 in case of failure
*
- * @see DecDCTvlcContinue(), DecDCTvlcCopyTable()
+ * @see DecDCTvlcContinue(), DecDCTvlcCopyTableV2(), DecDCTvlcCopyTableV3()
*/
int DecDCTvlcStart(VLC_Context *ctx, uint32_t *buf, size_t max_size, const uint32_t *bs);
@@ -275,7 +277,8 @@ int DecDCTvlcStart(VLC_Context *ctx, uint32_t *buf, size_t max_size, const uint3
* context returned 0; in that case the context shall be discarded or reused to
* decode another bitstream.
*
- * The contents of the GTE's LZCR register, if any, will be destroyed.
+ * The contents of the GTE's LZCS and LZCR registers, if any, will be
+ * destroyed.
*
* See DecDCTvlcStart() for more details.
*
@@ -309,7 +312,7 @@ int DecDCTvlcContinue(VLC_Context *ctx, uint32_t *buf, size_t max_size);
* @param buf
* @return 0, 1 if more data needs to be output or -1 in case of failure
*
- * @see DecDCTvlcSize(), DecDCTvlcCopyTable()
+ * @see DecDCTvlcSize(), DecDCTvlcCopyTableV2(), DecDCTvlcCopyTableV3()
*/
int DecDCTvlc(const uint32_t *bs, uint32_t *buf);
@@ -332,23 +335,60 @@ int DecDCTvlc(const uint32_t *bs, uint32_t *buf);
size_t DecDCTvlcSize(size_t size);
/**
- * @brief Moves the lookup table used by the .BS decompressor to the scratchpad
- * region.
+ * @brief Copies the lookup tables used by the .BS decompressor (v1/v2) to the
+ * scratchpad region.
+ *
+ * @details Copies the lookup table used by DecDCTvlcContinue(),
+ * DecDCTvlcStart() and DecDCTvlc() to the specified address. A copy of this
+ * table is always present in main RAM, however this function can be used to
+ * copy it to the scratchpad region to boost decompression performance.
+ *
+ * This function copies a 676-byte table (VLC_TableV2 structure) containing
+ * only the data necessary for decoding version 1 and 2 bitstreams, to help
+ * save scratchpad space. If support for version 3 is required,
+ * DecDCTvlcCopyTableV3() can be used instead to copy the full 816-byte table.
+ *
+ * The address passed to this function is saved. Calls to DecDCTvlcStart(),
+ * DecDCTvlcContinue() and DecDCTvlc() will automatically use the last table
+ * copied. Call DecDCTvlcCopyTableV2(0) or DecDCTvlcCopyTableV3(0) to revert to
+ * using the library's internal table in main RAM.
+ *
+ * WARNING: attempting to decode a version 3 .BS file or .STR frame after
+ * calling this function will result in undefined behavior and potentially a
+ * crash. To re-enable version 3 decoding, use DecDCTvlcCopyTableV3() to copy
+ * the full table to the scratchpad or revert to using the built-in table in
+ * main RAM.
+ *
+ * @param addr Pointer to free 676-byte area in scratchpad region or 0 to reset
+ *
+ * @see DecDCTvlcCopyTableV3()
+ */
+void DecDCTvlcCopyTableV2(VLC_TableV2 *addr);
+
+/**
+ * @brief Copies the lookup tables used by the .BS decompressor (v1/v2/v3) to
+ * the scratchpad region.
*
- * @details Copies the small (<1 KB) lookup table used by DecDCTvlcContinue(),
- * DecDCTvlcStart() and DecDCTvlc() (a DECDCTTAB structure) to the specified
- * address. A copy of this table is always present in main RAM, however this
- * function can be used to copy it to the scratchpad region to boost
- * decompression performance.
+ * @details Copies the lookup table used by DecDCTvlcContinue(),
+ * DecDCTvlcStart() and DecDCTvlc() to the specified address. A copy of this
+ * table is always present in main RAM, however this function can be used to
+ * copy it to the scratchpad region to boost decompression performance.
+ *
+ * This function copies the full 816-byte table (VLC_TableV3 structure),
+ * including the data used to decode version 3 bitstreams. If support for
+ * version 3 is not required, DecDCTvlcCopyTableV2() can be used instead to
+ * save scratchpad space by only copying the first 676 bytes of the table.
*
* The address passed to this function is saved. Calls to DecDCTvlcStart(),
* DecDCTvlcContinue() and DecDCTvlc() will automatically use the last table
- * copied. Call DecDCTvlcCopyTable(0) to revert to using the library's internal
- * table in main RAM.
+ * copied. Call DecDCTvlcCopyTableV2(0) or DecDCTvlcCopyTableV3(0) to revert to
+ * using the library's internal table in main RAM.
+ *
+ * @param addr Pointer to free 816-byte area in scratchpad region or 0 to reset
*
- * @param addr Pointer to free area in scratchpad region or 0 to reset
+ * @see DecDCTvlcCopyTableV2()
*/
-void DecDCTvlcCopyTable(DECDCTTAB *addr);
+void DecDCTvlcCopyTableV3(VLC_TableV3 *addr);
/**
* @brief Decompresses or begins decompressing a .BS file into MDEC codes
@@ -360,8 +400,8 @@ void DecDCTvlcCopyTable(DECDCTTAB *addr);
* calling DecDCTvlcBuild(), but does not use the GTE nor the scratchpad.
* Depending on the specific bitstream being decoded DecDCTvlcStart2() might be
* slightly faster or slower than DecDCTvlcStart() with its lookup table copied
- * to the scratchpad (see DecDCTvlcCopyTable()). DecDCTvlcStart() with the
- * table in main RAM tends to be much slower.
+ * to the scratchpad (see DecDCTvlcCopyTableV2() and DecDCTvlcCopyTableV3()).
+ * DecDCTvlcStart() with the table in main RAM tends to be much slower.
*
* A VLC_Context object must be created and passed to this function, which will
* then proceed to initialize its fields. The max_size argument sets the
@@ -432,7 +472,7 @@ int DecDCTvlcContinue2(VLC_Context *ctx, uint32_t *buf, size_t max_size);
*
* @see DecDCTvlcSize2(), DecDCTvlcBuild()
*/
-int DecDCTvlc2(const uint32_t *bs, uint32_t *buf, DECDCTTAB2 *table);
+int DecDCTvlc2(const uint32_t *bs, uint32_t *buf, DECDCTTAB *table);
/**
* @brief Sets the maximum amount of data to be decompressed (alternate
@@ -458,7 +498,7 @@ size_t DecDCTvlcSize2(size_t size);
* the .BS decompressor.
*
* @details Generates the lookup table required by DecDCTvlcStart2(),
- * DecDCTvlcContinue2() and DecDCTvlc2() (a DECDCTTAB2 structure) into the
+ * DecDCTvlcContinue2() and DecDCTvlc2() (a DECDCTTAB structure) into the
* specified buffer. Since the table is relatively large (34 KB), it is
* recommended to only generate it in a dynamically-allocated buffer when
* needed and deallocate the buffer afterwards.
@@ -468,7 +508,7 @@ size_t DecDCTvlcSize2(size_t size);
*
* @param table
*/
-void DecDCTvlcBuild(DECDCTTAB2 *table);
+void DecDCTvlcBuild(DECDCTTAB *table);
#ifdef __cplusplus
}
diff --git a/libpsn00b/psxpress/README.md b/libpsn00b/psxpress/README.md
index a894874..df18ec5 100644
--- a/libpsn00b/psxpress/README.md
+++ b/libpsn00b/psxpress/README.md
@@ -1,14 +1,19 @@
# PSn00bSDK MDEC library
-This is a fully open source reimplementation of the official SDK's "data
+This is a fully original reimplementation of the official SDK's "data
compression" library. This library is made up of two parts, the MDEC API and
functions to decompress Huffman-encoded bitstreams (.BS files, or frames in
-.STR files) into data to be fed to the MDEC. FMV playback is not part of this
-library (nor the official one) per se, but can implemented by using these APIs
-alongside some code to stream data from the CD drive.
+.STR files) into data to be fed to the MDEC. Two different implementations of
+the latter are provided, one using the GTE and scratchpad region and an older
+one using a large lookup table in main RAM.
-**Currently only version 1 and 2 bitstreams are supported**.
+FMV playback is not part of this library per se, but can be implemented using the
+APIs defined here alongside some code to stream data from the CD drive.
+
+Currently bitstream versions 1, 2 and 3 are supported. Version 0 and .IKI
+bitstreams are not supported, but no encoder is publicly available for those
+anyway.
## MDEC API
@@ -26,14 +31,16 @@ The following functions are currently provided:
- `DecDCTvlcStart()`, `DecDCTvlcContinue()`: a decompressor implementation that
uses a small (<1 KB) lookup table and leverages the GTE, written in assembly.
- `DecDCTvlcCopyTable()` can optionally be called to temporarily move the table
- to the scratchpad region to improve decompression speed.
-- `DecDCTvlcStart2()`, `DecDCTvlcContinue2()`: a different implementation using
+ `DecDCTvlcCopyTableV2()` or `DecDCTvlcCopyTableV3()` may optionally be called
+ to temporarily copy the table to the scratchpad region in order to boost
+ decompression speed.
+- `DecDCTvlcStart2()`, `DecDCTvlcContinue2()`: an older implementation using
a large (34 KB) lookup table in main RAM, written in C. The table must be
- decompressed ahead of time using `DecDCTvlcBuild()`, but can be deallocated
- when no longer needed.
+ decompressed ahead of time manually using `DecDCTvlcBuild()`, but can be
+ deallocated when no longer needed. **This implementation does not support**
+ **version 3 bitstreams**.
- `DecDCTvlc()`, `DecDCTvlc2()`: wrappers around the functions listed above,
- for compatibility with the Sony SDK. Using them is not recommended.
+ for compatibility with the Sony SDK.
## SPU ADPCM encoding API
diff --git a/libpsn00b/psxpress/mdec.c b/libpsn00b/psxpress/mdec.c
index 3596188..394a0ce 100644
--- a/libpsn00b/psxpress/mdec.c
+++ b/libpsn00b/psxpress/mdec.c
@@ -1,12 +1,11 @@
/*
* PSn00bSDK MDEC library (low-level MDEC/DMA API)
- * (C) 2022 spicyjpeg - MPL licensed
+ * (C) 2022-2023 spicyjpeg - MPL licensed
*/
#include <stdint.h>
#include <assert.h>
#include <psxetc.h>
-#include <psxapi.h>
#include <psxpress.h>
#include <hwregs_c.h>
@@ -15,14 +14,14 @@
/* Default IDCT matrix and quantization tables */
-#define S0 0x5a82 // 0x4000 * cos(0/16 * pi) * sqrt(2)
-#define S1 0x7d8a // 0x4000 * cos(1/16 * pi) * 2
-#define S2 0x7641 // 0x4000 * cos(2/16 * pi) * 2
-#define S3 0x6a6d // 0x4000 * cos(3/16 * pi) * 2
-#define S4 0x5a82 // 0x4000 * cos(4/16 * pi) * 2
-#define S5 0x471c // 0x4000 * cos(5/16 * pi) * 2
-#define S6 0x30fb // 0x4000 * cos(6/16 * pi) * 2
-#define S7 0x18f8 // 0x4000 * cos(7/16 * pi) * 2
+#define S0 0x5a82 // (1 << 14) * cos(0/16 * pi) * sqrt(2)
+#define S1 0x7d8a // (1 << 14) * cos(1/16 * pi) * 2
+#define S2 0x7641 // (1 << 14) * cos(2/16 * pi) * 2
+#define S3 0x6a6d // (1 << 14) * cos(3/16 * pi) * 2
+#define S4 0x5a82 // (1 << 14) * cos(4/16 * pi) * 2
+#define S5 0x471c // (1 << 14) * cos(5/16 * pi) * 2
+#define S6 0x30fb // (1 << 14) * cos(6/16 * pi) * 2
+#define S7 0x18f8 // (1 << 14) * cos(7/16 * pi) * 2
static const DECDCTENV _default_mdec_env = {
// The default luma and chroma quantization table is based on the MPEG-1
@@ -85,8 +84,6 @@ static const DECDCTENV _default_mdec_env = {
/* Public API */
void DecDCTReset(int mode) {
- FastEnterCriticalSection();
-
SetDMAPriority(DMA_MDEC_IN, 3);
SetDMAPriority(DMA_MDEC_OUT, 3);
DMA_CHCR(DMA_MDEC_IN) = 0x00000201; // Stop DMA
@@ -95,26 +92,28 @@ void DecDCTReset(int mode) {
MDEC1 = 0x80000000; // Reset MDEC
MDEC1 = 0x60000000; // Enable DMA in/out requests
- FastExitCriticalSection();
if (!mode)
DecDCTPutEnv(0, 0);
}
void DecDCTPutEnv(const DECDCTENV *env, int mono) {
- const DECDCTENV *_env = env ? env : &_default_mdec_env;
DecDCTinSync(0);
+ if (!env)
+ env = &_default_mdec_env;
MDEC0 = 0x60000000; // Set IDCT matrix
- DecDCTinRaw((const uint32_t *) _env->dct, 32);
+ DecDCTinRaw((const uint32_t *) env->dct, 32);
DecDCTinSync(0);
- MDEC0 = 0x40000000 | (mono ? 0 : 1); // Set table(s)
- DecDCTinRaw((const uint32_t *) _env->iq_y, mono ? 16 : 32);
+ MDEC0 = 0x40000000 | (mono ? 0 : 1); // Set quantization table(s)
+ DecDCTinRaw((const uint32_t *) env->iq_y, mono ? 16 : 32);
DecDCTinSync(0);
}
void DecDCTin(const uint32_t *data, int mode) {
uint32_t header = *data;
+ DecDCTinSync(0);
+
if (mode == DECDCT_MODE_RAW)
MDEC0 = header;
else if (mode & DECDCT_MODE_24BPP)
@@ -153,7 +152,7 @@ int DecDCTinSync(int mode) {
return 0;
}
- _sdk_log("DecDCTinSync() timeout\n");
+ _sdk_log("DecDCTinSync() timeout, MDEC1=0x%08x\n", MDEC1);
return -1;
}
@@ -184,6 +183,6 @@ int DecDCToutSync(int mode) {
return 0;
}
- _sdk_log("DecDCToutSync() timeout\n");
+ _sdk_log("DecDCToutSync() timeout, CHCR=0x%08x\n", DMA_CHCR(DMA_MDEC_OUT));
return -1;
}
diff --git a/libpsn00b/psxpress/vlc.c b/libpsn00b/psxpress/vlc.c
index 4e3e283..36cfbe2 100644
--- a/libpsn00b/psxpress/vlc.c
+++ b/libpsn00b/psxpress/vlc.c
@@ -1,6 +1,6 @@
/*
* PSn00bSDK MDEC library (support code for the main VLC decompressor)
- * (C) 2022 spicyjpeg - MPL licensed
+ * (C) 2022-2023 spicyjpeg - MPL licensed
*/
#include <stdint.h>
@@ -10,87 +10,120 @@
/* Huffman code lookup table */
-#define _val1(rl, dc) (((rl) << 10) | ((uint16_t) (dc) & 0x3ff))
-#define _val2(rl, dc, len) (_val1(rl, dc) | (len << 16))
+#define _DC(y, c) (((y) << 4) | (c))
+#define _AC(rl, dc) (((rl) << 10) | ((uint16_t) (dc) & 0x3ff))
+#define _ACL(rl, dc, len) (_AC(rl, dc) | ((len) << 16))
-#define _pair(rl, dc) _val1(rl, dc), _val1(rl, -(dc))
-#define _pair2(rl, dc, len) _val2(rl, dc, len), _val2(rl, -(dc), len)
-#define _pair3(rl, dc, len) \
- _val2(rl, dc, len), _val2(rl, dc, len), \
- _val2(rl, -(dc), len), _val2(rl, -(dc), len)
-#define _pair4(rl, dc, len) \
- _val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), \
- _val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), \
- _val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len), \
- _val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len)
+#define _DC2(y, c) _DC(y, c), _DC(y, c)
+#define _DC3(y, c) _DC(y, c), _DC(y, c), _DC(y, c), _DC(y, c)
+#define _DC4(y, c) \
+ _DC(y, c), _DC(y, c), _DC(y, c), _DC(y, c), \
+ _DC(y, c), _DC(y, c), _DC(y, c), _DC(y, c)
+#define _AC2(rl, dc) _AC(rl, dc), _AC(rl, -(dc))
+#define _ACL2(rl, dc, len) _ACL(rl, dc, len), _ACL(rl, -(dc), len)
+#define _ACL3(rl, dc, len) \
+ _ACL(rl, dc, len), _ACL(rl, dc, len), \
+ _ACL(rl, -(dc), len), _ACL(rl, -(dc), len)
+#define _ACL4(rl, dc, len) \
+ _ACL(rl, dc, len), _ACL(rl, dc, len), _ACL(rl, dc, len), _ACL(rl, dc, len), \
+ _ACL(rl, dc, len), _ACL(rl, dc, len), _ACL(rl, dc, len), _ACL(rl, dc, len), \
+ _ACL(rl, -(dc), len), _ACL(rl, -(dc), len), _ACL(rl, -(dc), len), _ACL(rl, -(dc), len), \
+ _ACL(rl, -(dc), len), _ACL(rl, -(dc), len), _ACL(rl, -(dc), len), _ACL(rl, -(dc), len)
// This table isn't compressed since it makes no sense to compress less than a
// kilobyte's worth of data.
-static const DECDCTTAB _default_huffman_table = {
- .lut0 = {
+static const VLC_TableV3 _default_huffman_table = {
+ .ac0 = {
// 11 x
- _pair( 0, 1)
+ _AC2( 0, 1)
},
- .lut2 = {
+ .ac2 = {
// 01 0xx
- _pair2( 0, 2, 5), _pair2( 2, 1, 5),
+ _ACL2( 0, 2, 5), _ACL2( 2, 1, 5),
// 01 1x-
- _pair3( 1, 1, 4)
+ _ACL3( 1, 1, 4)
},
- .lut3 = {
+ .ac3 = {
// 001 00xxxx
- _pair2(13, 1, 9), _pair2( 0, 6, 9), _pair2(12, 1, 9), _pair2(11, 1, 9),
- _pair2( 3, 2, 9), _pair2( 1, 3, 9), _pair2( 0, 5, 9), _pair2(10, 1, 9),
+ _ACL2(13, 1, 9), _ACL2( 0, 6, 9), _ACL2(12, 1, 9), _ACL2(11, 1, 9),
+ _ACL2( 3, 2, 9), _ACL2( 1, 3, 9), _ACL2( 0, 5, 9), _ACL2(10, 1, 9),
// 001 xxx---
- _pair4( 0, 3, 6), _pair4( 4, 1, 6), _pair4( 3, 1, 6)
+ _ACL4( 0, 3, 6), _ACL4( 4, 1, 6), _ACL4( 3, 1, 6)
},
- .lut4 = {
+ .ac4 = {
// 0001 xxx
- _pair( 7, 1), _pair( 6, 1), _pair( 1, 2), _pair( 5, 1)
+ _AC2( 7, 1), _AC2( 6, 1), _AC2( 1, 2), _AC2( 5, 1)
},
- .lut5 = {
+ .ac5 = {
// 00001 xxx
- _pair( 2, 2), _pair( 9, 1), _pair( 0, 4), _pair( 8, 1)
+ _AC2( 2, 2), _AC2( 9, 1), _AC2( 0, 4), _AC2( 8, 1)
},
- .lut7 = {
+ .ac7 = {
// 0000001 xxxx
- _pair(16, 1), _pair( 5, 2), _pair( 0, 7), _pair( 2, 3),
- _pair( 1, 4), _pair(15, 1), _pair(14, 1), _pair( 4, 2)
+ _AC2(16, 1), _AC2( 5, 2), _AC2( 0, 7), _AC2( 2, 3),
+ _AC2( 1, 4), _AC2(15, 1), _AC2(14, 1), _AC2( 4, 2)
},
- .lut8 = {
+ .ac8 = {
// 00000001 xxxxx
- _pair( 0, 11), _pair( 8, 2), _pair( 4, 3), _pair( 0, 10),
- _pair( 2, 4), _pair( 7, 2), _pair(21, 1), _pair(20, 1),
- _pair( 0, 9), _pair(19, 1), _pair(18, 1), _pair( 1, 5),
- _pair( 3, 3), _pair( 0, 8), _pair( 6, 2), _pair(17, 1)
+ _AC2( 0, 11), _AC2( 8, 2), _AC2( 4, 3), _AC2( 0, 10),
+ _AC2( 2, 4), _AC2( 7, 2), _AC2(21, 1), _AC2(20, 1),
+ _AC2( 0, 9), _AC2(19, 1), _AC2(18, 1), _AC2( 1, 5),
+ _AC2( 3, 3), _AC2( 0, 8), _AC2( 6, 2), _AC2(17, 1)
},
- .lut9 = {
+ .ac9 = {
// 000000001 xxxxx
- _pair(10, 2), _pair( 9, 2), _pair( 5, 3), _pair( 3, 4),
- _pair( 2, 5), _pair( 1, 7), _pair( 1, 6), _pair( 0, 15),
- _pair( 0, 14), _pair( 0, 13), _pair( 0, 12), _pair(26, 1),
- _pair(25, 1), _pair(24, 1), _pair(23, 1), _pair(22, 1)
+ _AC2(10, 2), _AC2( 9, 2), _AC2( 5, 3), _AC2( 3, 4),
+ _AC2( 2, 5), _AC2( 1, 7), _AC2( 1, 6), _AC2( 0, 15),
+ _AC2( 0, 14), _AC2( 0, 13), _AC2( 0, 12), _AC2(26, 1),
+ _AC2(25, 1), _AC2(24, 1), _AC2(23, 1), _AC2(22, 1)
},
- .lut10 = {
+ .ac10 = {
// 0000000001 xxxxx
- _pair( 0, 31), _pair( 0, 30), _pair( 0, 29), _pair( 0, 28),
- _pair( 0, 27), _pair( 0, 26), _pair( 0, 25), _pair( 0, 24),
- _pair( 0, 23), _pair( 0, 22), _pair( 0, 21), _pair( 0, 20),
- _pair( 0, 19), _pair( 0, 18), _pair( 0, 17), _pair( 0, 16)
+ _AC2( 0, 31), _AC2( 0, 30), _AC2( 0, 29), _AC2( 0, 28),
+ _AC2( 0, 27), _AC2( 0, 26), _AC2( 0, 25), _AC2( 0, 24),
+ _AC2( 0, 23), _AC2( 0, 22), _AC2( 0, 21), _AC2( 0, 20),
+ _AC2( 0, 19), _AC2( 0, 18), _AC2( 0, 17), _AC2( 0, 16)
},
- .lut11 = {
+ .ac11 = {
// 00000000001 xxxxx
- _pair( 0, 40), _pair( 0, 39), _pair( 0, 38), _pair( 0, 37),
- _pair( 0, 36), _pair( 0, 35), _pair( 0, 34), _pair( 0, 33),
- _pair( 0, 32), _pair( 1, 14), _pair( 1, 13), _pair( 1, 12),
- _pair( 1, 11), _pair( 1, 10), _pair( 1, 9), _pair( 1, 8)
+ _AC2( 0, 40), _AC2( 0, 39), _AC2( 0, 38), _AC2( 0, 37),
+ _AC2( 0, 36), _AC2( 0, 35), _AC2( 0, 34), _AC2( 0, 33),
+ _AC2( 0, 32), _AC2( 1, 14), _AC2( 1, 13), _AC2( 1, 12),
+ _AC2( 1, 11), _AC2( 1, 10), _AC2( 1, 9), _AC2( 1, 8)
},
- .lut12 = {
+ .ac12 = {
// 000000000001 xxxxx
- _pair( 1, 18), _pair( 1, 17), _pair( 1, 16), _pair( 1, 15),
- _pair( 6, 3), _pair(16, 2), _pair(15, 2), _pair(14, 2),
- _pair(13, 2), _pair(12, 2), _pair(11, 2), _pair(31, 1),
- _pair(30, 1), _pair(29, 1), _pair(28, 1), _pair(27, 1)
+ _AC2( 1, 18), _AC2( 1, 17), _AC2( 1, 16), _AC2( 1, 15),
+ _AC2( 6, 3), _AC2(16, 2), _AC2(15, 2), _AC2(14, 2),
+ _AC2(13, 2), _AC2(12, 2), _AC2(11, 2), _AC2(31, 1),
+ _AC2(30, 1), _AC2(29, 1), _AC2(28, 1), _AC2(27, 1)
+ },
+ .dc = {
+ // 00-----
+ _DC4(1, 0), _DC4(1, 0), _DC4(1, 0), _DC4(1, 0),
+ // 01-----
+ _DC4(2, 1), _DC4(2, 1), _DC4(2, 1), _DC4(2, 1),
+ // 100----
+ _DC4(0, 2), _DC4(0, 2),
+ // 101----
+ _DC4(3, 2), _DC4(3, 2),
+ // 110----
+ _DC4(4, 3), _DC4(4, 3),
+ // 1110---
+ _DC4(5, 4),
+ // 11110--
+ _DC3(6, 5),
+ // 111110-
+ _DC2(7, 6),
+ // 1111110
+ _DC(8, 7),
+ // 1111111(0)
+ _DC(0, 8)
+ },
+ .dc_len = {
+ _DC(3, 2), _DC(2, 2), _DC(2, 2), _DC(3, 3),
+ _DC(3, 4), _DC(4, 5), _DC(5, 6), _DC(6, 7),
+ _DC(7, 8)
}
};
@@ -100,7 +133,7 @@ static const DECDCTTAB _default_huffman_table = {
static VLC_Context _default_context;
static size_t _max_buffer_size = 0;
-const DECDCTTAB *_vlc_huffman_table = &_default_huffman_table;
+const VLC_TableV3 *_vlc_huffman_table = &_default_huffman_table;
/* Stateful VLC decoder API (for Sony SDK compatibility) */
@@ -120,10 +153,19 @@ size_t DecDCTvlcSize(size_t size) {
/* Lookup table relocation API */
-void DecDCTvlcCopyTable(DECDCTTAB *addr) {
+void DecDCTvlcCopyTableV2(VLC_TableV2 *addr) {
+ if (addr) {
+ _vlc_huffman_table = (const VLC_TableV3 *) addr;
+ memcpy(addr, &_default_huffman_table, sizeof(VLC_TableV2));
+ } else {
+ _vlc_huffman_table = &_default_huffman_table;
+ }
+}
+
+void DecDCTvlcCopyTableV3(VLC_TableV3 *addr) {
if (addr) {
- _vlc_huffman_table = addr;
- memcpy(addr, &_default_huffman_table, sizeof(DECDCTTAB));
+ _vlc_huffman_table = (const VLC_TableV3 *) addr;
+ memcpy(addr, &_default_huffman_table, sizeof(VLC_TableV3));
} else {
_vlc_huffman_table = &_default_huffman_table;
}
diff --git a/libpsn00b/psxpress/vlc.s b/libpsn00b/psxpress/vlc.s
index f3a1c67..2de22f7 100644
--- a/libpsn00b/psxpress/vlc.s
+++ b/libpsn00b/psxpress/vlc.s
@@ -1,375 +1,576 @@
# PSn00bSDK MDEC library (GTE-accelerated VLC decompressor)
-# (C) 2022 spicyjpeg - MPL licensed
+# (C) 2022-2023 spicyjpeg - MPL licensed
#
-# Register map:
-# - $a0 = ctx
-# - $a1 = output
-# - $a2 = max_size
-# - $a3 = input
-# - $t0 = window
-# - $t1 = next_window
-# - $t2 = remaining
-# - $t3 = quant_scale
-# - $t4 = is_v3
-# - $t5 = bit_offset
-# - $t6 = block_index
-# - $t7 = coeff_index
-# - $t8 = _vlc_huffman_table
-# - $t9 = &ac_jump_area
+# TODO: reduce the size of the v3 DC coefficient decoder; currently the code is
+# duplicated for each block type, but it can probably be shortened with no
+# performance impact...
-.set noreorder
+.include "gtereg.inc"
-.set VLC_Context_input, 0
-.set VLC_Context_window, 4
-.set VLC_Context_next_window, 8
-.set VLC_Context_remaining, 12
-.set VLC_Context_quant_scale, 16
-.set VLC_Context_is_v3, 18
-.set VLC_Context_bit_offset, 19
-.set VLC_Context_block_index, 20
-.set VLC_Context_coeff_index, 21
-
-.set DECDCTTAB_lut0, 0
-.set DECDCTTAB_lut2, 4
-.set DECDCTTAB_lut3, 36
-.set DECDCTTAB_lut4, 292
-.set DECDCTTAB_lut5, 308
-.set DECDCTTAB_lut7, 324
-.set DECDCTTAB_lut8, 356
-.set DECDCTTAB_lut9, 420
-.set DECDCTTAB_lut10, 484
-.set DECDCTTAB_lut11, 548
-.set DECDCTTAB_lut12, 612
+.set noreorder
+.set noat
+
+.set value, $v0
+.set length, $v1
+.set ctx, $a0
+.set output, $a1
+.set max_size, $a2
+.set input, $a3
+.set temp, $t0
+.set window, $t1
+.set next_window, $t2
+.set remaining, $t3
+.set is_v3, $t4
+.set bit_offset, $t5
+.set block_index, $t6
+.set coeff_index, $t7
+.set quant_scale, $s0
+.set last_y, $s1
+.set last_cr, $s2
+.set last_cb, $s3
+.set huffman_table, $t8
+.set ac_jump_area, $t9
+
+.set VLC_Context_input, 0x0
+.set VLC_Context_window, 0x4
+.set VLC_Context_next_window, 0x8
+.set VLC_Context_remaining, 0xc
+.set VLC_Context_is_v3, 0x10
+.set VLC_Context_bit_offset, 0x11
+.set VLC_Context_block_index, 0x12
+.set VLC_Context_coeff_index, 0x13
+.set VLC_Context_quant_scale, 0x14
+.set VLC_Context_last_y, 0x16
+.set VLC_Context_last_cr, 0x18
+.set VLC_Context_last_cb, 0x1a
+
+.set VLC_Table_ac0, 0x0
+.set VLC_Table_ac2, 0x4
+.set VLC_Table_ac3, 0x24
+.set VLC_Table_ac4, 0x124
+.set VLC_Table_ac5, 0x134
+.set VLC_Table_ac7, 0x144
+.set VLC_Table_ac8, 0x164
+.set VLC_Table_ac9, 0x1a4
+.set VLC_Table_ac10, 0x1e4
+.set VLC_Table_ac11, 0x224
+.set VLC_Table_ac12, 0x264
+.set VLC_Table_dc, 0x2a4
+.set VLC_Table_dc_len, 0x324
.section .text.DecDCTvlcStart
.global DecDCTvlcStart
.type DecDCTvlcStart, @function
DecDCTvlcStart:
+ addiu $sp, -16
+ sw $s0, 0($sp)
+ sw $s1, 4($sp)
+ sw $s2, 8($sp)
+ sw $s3, 12($sp)
+
# Create a new context on-the-fly without writing it to memory then jump
# into DecDCTvlcContinue(), skipping context loading.
- lw $t0, 8($a3) # window = (bs->data[0] << 16) | (bs->data[0] >> 16)
- nop
- srl $v0, $t0, 16
- sll $t0, 16
-
- lw $t1, 12($a3) # next_window = (bs->data[1] << 16) | (bs->data[1] >> 16)
- or $t0, $v0
- srl $v0, $t1, 16
- sll $t1, 16
-
- lhu $t2, 0($a3) # remaining = bs->uncomp_length * 2
- or $t1, $v0
-
- lhu $t3, 4($a3) # quant_scale = (bs->quant_scale & 63) << 10
- sll $t2, 1
- andi $t3, 63
-
- lhu $t4, 6($a3) # is_v3 = !(bs->version < 3)
- sll $t3, 10
- sltiu $t4, $t4, 3
- xori $t4, 1
-
- li $t5, 32 # bit_offset = 32
- li $t6, 5 # block_index = 5
- li $t7, 0 # coeff_index = 0
+ lw window, 8(input) # window = (bs->data[0] << 16) | (bs->data[0] >> 16)
+ li last_y, 0
+ srl temp, window, 16
+ sll window, 16
+ or window, temp
+
+ # next_window = (bs->data[1] << 16) | (bs->data[1] >> 16)
+ lw next_window, 12(input)
+ li last_cr, 0
+ srl temp, next_window, 16
+ sll next_window, 16
+ or next_window, temp
+
+ lhu remaining, 0(input) # remaining = bs->uncomp_length * 2
+ li last_cb, 0
+ sll remaining, 1
+
+ lw temp, 4(input) # quant_scale = (bs->quant_scale & 63) << 10
+ li bit_offset, 32
+ andi quant_scale, temp, 63
+ sll quant_scale, 10
+
+ srl temp, 16 # is_v3 = !(bs->version < 3)
+ sltiu is_v3, temp, 3
+ xori is_v3, 1
+
+ li block_index, 5
+ li coeff_index, 0
j _vlc_skip_context_load
- addiu $a3, 16 # input = &(bs->data[2])
+ addiu input, 16 # input = &(bs->data[2])
.section .text.DecDCTvlcContinue
.global DecDCTvlcContinue
.type DecDCTvlcContinue, @function
DecDCTvlcContinue:
- lw $a3, VLC_Context_input($a0)
- lw $t0, VLC_Context_window($a0)
- lw $t1, VLC_Context_next_window($a0)
- lw $t2, VLC_Context_remaining($a0)
- lhu $t3, VLC_Context_quant_scale($a0)
- lb $t4, VLC_Context_is_v3($a0)
- lb $t5, VLC_Context_bit_offset($a0)
- lb $t6, VLC_Context_block_index($a0)
- lb $t7, VLC_Context_coeff_index($a0)
+ addiu $sp, -16
+ sw $s0, 0($sp)
+ sw $s1, 4($sp)
+ sw $s2, 8($sp)
+ sw $s3, 12($sp)
+
+ lw input, VLC_Context_input(ctx)
+ lw window, VLC_Context_window(ctx)
+ lw next_window, VLC_Context_next_window(ctx)
+ lw remaining, VLC_Context_remaining(ctx)
+ lb is_v3, VLC_Context_is_v3(ctx)
+ lb bit_offset, VLC_Context_bit_offset(ctx)
+ lb block_index, VLC_Context_block_index(ctx)
+ lb coeff_index, VLC_Context_coeff_index(ctx)
+ lhu quant_scale, VLC_Context_quant_scale(ctx)
+ lh last_y, VLC_Context_last_y(ctx)
+ lh last_cr, VLC_Context_last_cr(ctx)
+ lh last_cb, VLC_Context_last_cb(ctx)
_vlc_skip_context_load:
- # Determine how many bytes to output. This whole block of code basically
- # does this:
+ # Determine how many bytes to output.
+ # if (max_size <= 0) max_size = 0x3fff0000
# max_size = min((max_size - 1) * 2, remaining)
# remaining -= max_size
- bgtz $a2, .Lmax_size_valid # if (max_size <= 0) max_size = 0x7ffe0000
- addiu $a2, -1 # else max_size = (max_size - 1) * 2
- lui $a2, 0x3fff
+ bgtz max_size, .Lmax_size_valid
+ addiu max_size, -1
+ lui max_size, 0x3fff
.Lmax_size_valid:
- sll $a2, 1
+ sll max_size, 1
- blt $a2, $t2, .Lmax_size_ok # if (max_size > remaining) max_size = remaining
- lui $v1, 0x3800
- move $a2, $t2
-.Lmax_size_ok:
- subu $t2, $a2 # remaining -= max_size
+ subu remaining, max_size
+ bgez remaining, .Lmax_size_ok
+ lui temp, 0x3800
+ addu max_size, remaining
+ li remaining, 0
+
+.Lmax_size_ok:
# Write the length of the data that will be decoded to the first 4 bytes of
# the output buffer, which will then be parsed by DecDCTin().
- srl $v0, $a2, 1 # output[0] = 0x38000000 | (max_size / 2)
- or $v0, $v1
- sw $v0, 0($a1)
+ srl value, max_size, 1 # output[0] = 0x38000000 | (max_size / 2)
+ or value, temp
+ sw value, 0(output)
# Obtain the addresses of the lookup table and jump area in advance so that
# they don't have to be retrieved for each coefficient decoded.
- lw $t8, _vlc_huffman_table
- la $t9, .Lac_prefix_10
+ lw huffman_table, _vlc_huffman_table
+ la ac_jump_area, .Lac_prefix_01 - 32
- beqz $a2, .Lstop_processing
- addiu $a1, 4 # output = (uint16_t *) &output[1]
+ beqz max_size, .Lstop_processing
+ addiu output, 4
.Lprocess_next_code_loop: # while (max_size)
# This is the "hot" part of the decoder, executed for each code in the
# bitstream. The first step is to determine if the next code is a DC or AC
- # coefficient.
- bnez $t7, .Lprocess_ac_coefficient
- addiu $t7, 1 # coeff_index++
- bnez $t4, .Lprocess_dc_v3_coefficient
- li $v1, 0x01ff
+ # coefficient; at the same time the GTE is given the task of counting the
+ # number of leading zeroes/ones in the code (which takes 2 more cycles).
+ mtc2 window, C2_LZCS
+
+ bnez coeff_index, .Lprocess_ac_coefficient
+ addiu coeff_index, 1
+ bnez is_v3, .Lprocess_dc_v3_coefficient
+ li temp, 0x1ff
.Lprocess_dc_v2_coefficient: # if (!coeff_index && !is_v3)
# The DC coefficient in version 2 frames is not compressed. Value 0x1ff is
# used to signal the end of the bitstream.
- srl $v0, $t0, 22 # prefix = (window >> (32 - 10))
- beq $v0, $v1, .Lstop_processing # if (prefix == 0x1ff) break
- or $v0, $t3 # *output = prefix | quant_scale
- sll $t0, 10 # window <<= 10
- b .Lwrite_value
- addiu $t5, -10 # bit_offset -= 10
+ # prefix = window >> (32 - 10)
+ # if (prefix == 0x1ff) break
+ # *output = prefix | quant_scale
+ srl value, window, 22
+ beq value, temp, .Lstop_processing
+ or value, quant_scale
+ sll window, 10
+ addiu bit_offset, -10
+
+ b .Lfeed_bitstream
+ sh value, 0(output)
.Lprocess_dc_v3_coefficient: # if (!coeff_index && is_v3)
- # TODO: version 3 is currently not supported.
- jr $ra
- li $v0, -1
-
-.Lprocess_ac_coefficient: # if (coeff_index)
- # Check whether the prefix code is one of the shorter, more common ones,
- # and start counting the number of leading zeroes/ones using the GTE (which
- # takes 2 more cycles).
- srl $v0, $t0, 30
- li $v1, 3
- beq $v0, $v1, .Lac_prefix_11
- li $v1, 2
- beq $v0, $v1, .Lac_prefix_10
- li $v1, 1
- mtc2 $t0, $30
- beq $v0, $v1, .Lac_prefix_01
+ # Version 3 DC coefficients are variable-length deltas, prefixed with a
+ # Huffman code indicating their length. Since the prefix code is up to 7
+ # bits long, it makes sense to decode it with a simple 128-byte lookup
+ # table rather than using the GTE. The codes are different for luma and
+ # chroma blocks, so each table entry contains the decoded length for both
+ # block types (packed as two nibbles). Prefix 111111111 is used to signal
+ # the end of the bitstream.
+ # prefix = window >> (32 - 9)
+ # if (prefix == 0x1ff) break
+ # lengths = huffman_table->dc[prefix >> 2]
+ srl length, window, 23
+ beq length, temp, .Lstop_processing
+ srl length, 2
+ addu length, huffman_table
+
+ addiu $at, block_index, -4
+ bltz $at, .Ldc_block_y
+ lbu length, VLC_Table_dc(length)
+ beqz $at, .Ldc_block_cb
+ andi length, 15 # if (block_index >= Cb) dc_length = lengths & 15
+
+.Ldc_block_cr: # if (block_index > Cb)
+ # prefix_length = huffman_table->dc_len[dc_length] & 15
+ addu temp, length, huffman_table
+ lbu temp, VLC_Table_dc_len(temp)
+ li $at, 32
+ andi temp, 15
+
+ sllv window, window, temp
+ beqz length, .Ldc_cr_zero # if (dc_length)
+ subu bit_offset, temp
+
+ subu $at, length # value = window >> (32 - dc_length)
+ srlv value, window, $at
+
+ # Decode the sign bit, then add the decoded delta to the current value.
+ # if (!(window >> 31)) value -= (1 << dc_length) - 1
+ bltz window, .Ldc_cr_positive
+ li temp, -1
+ srlv temp, temp, $at
+ subu value, temp
+.Ldc_cr_positive:
+ addu last_cr, value
+ andi last_cr, 0x3ff
+
+.Ldc_cr_zero:
+ sll temp, last_cr, 2 # *output = (last_cr << 2) | quant_scale
+ or temp, quant_scale
+ b .Lupdate_window_dc # update_window(dc_length)
+ sh temp, 0(output)
+
+.Ldc_block_cb: # if (block_index == Cb)
+ # prefix_length = huffman_table->dc_len[dc_length] & 15
+ addu temp, length, huffman_table
+ lbu temp, VLC_Table_dc_len(temp)
+ li $at, 32
+ andi temp, 15
+
+ sllv window, window, temp
+ beqz length, .Ldc_cb_zero # if (dc_length)
+ subu bit_offset, temp
+
+ subu $at, length # value = window >> (32 - dc_length)
+ srlv value, window, $at
+
+ # Decode the sign bit, then add the decoded delta to the current value.
+ # if (!(window >> 31)) value -= (1 << dc_length) - 1
+ bltz window, .Ldc_cb_positive
+ li temp, -1
+ srlv temp, temp, $at
+ subu value, temp
+.Ldc_cb_positive:
+ addu last_cb, value
+ andi last_cb, 0x3ff
+
+.Ldc_cb_zero:
+ sll value, last_cb, 2 # *output = (last_cb << 2) | quant_scale
+ or value, quant_scale
+ b .Lupdate_window_dc # update_window(dc_length)
+ sh value, 0(output)
+
+.Ldc_block_y: # if (block_index < Cb)
+ # Luma (Y) block: decode a v3 DC delta and accumulate it into last_y. The
+ # luma fields are stored in the upper nibble of each dc/dc_len table entry.
+ # The nop below fills the load delay slot of the lbu that fetched the
+ # packed lengths (executed in the delay slot of the branch leading here).
nop
+ srl length, 4 # dc_length = lengths >> 4
+
+ # prefix_length = huffman_table->dc_len[dc_length] >> 4
+ addu temp, length, huffman_table
+ lbu temp, VLC_Table_dc_len(temp)
+ li $at, 32
+ srl temp, 4
+
+ # Skip the Huffman prefix; if dc_length is zero the delta is zero and
+ # last_y is output unchanged.
+ sllv window, window, temp
+ beqz length, .Ldc_y_zero # if (dc_length)
+ subu bit_offset, temp
+
+ subu $at, length # value = window >> (32 - dc_length)
+ srlv value, window, $at
+
+ # Decode the sign bit, then add the decoded delta to the current value.
+ # if (!(window >> 31)) value -= (1 << dc_length) - 1
+ bltz window, .Ldc_y_positive
+ li temp, -1
+ srlv temp, temp, $at
+ subu value, temp
+.Ldc_y_positive:
+ addu last_y, value
+ andi last_y, 0x3ff
+
+.Ldc_y_zero:
+ sll temp, last_y, 2 # *output = (last_y << 2) | quant_scale
+ or temp, quant_scale
+ b .Lupdate_window_dc # update_window(dc_length)
+ sh temp, 0(output)
- # If the code is longer, retrieve the number of leading zeroes from the GTE
- # and use it as an index into the jump area. Each block in the area is 8
- # instructions long and handles decoding a specific prefix.
- mfc2 $v0, $31
- li $v1, 11
- bgt $v0, $v1, .Lreturn_error # if (prefix > 11) return -1
- sll $v0, 5 # jump_addr = &ac_jump_area[prefix * 8 * sizeof(u32)]
- addu $v0, $t9
- jr $v0
+.Lprocess_ac_coefficient: # if (coeff_index)
+ # Check whether the prefix code is 10 or 11 (i.e. if it starts with 1). If
+ # not, retrieve the number of leading zeroes from the GTE and use it as an
+ # index into the jump area. Each block in the area is 8 instructions long
+ # and handles decoding a specific prefix.
+ mfc2 temp, C2_LZCR
+
+ bltz window, .Lac_prefix_1 # if (!(window >> 31))
+ addiu $at, temp, -11 # if (prefix > 11) return -1
+ bgtz $at, .Lreturn_error
+ sll temp, 5 # jump_addr = &ac_jump_area[prefix * 8 * sizeof(uint32_t)]
+ addu temp, ac_jump_area
+ jr temp
nop
.Lreturn_error:
- jr $ra
+ b .Lreturn
li $v0, -1
-.Lac_prefix_11:
- # Prefix 11 is followed by a single bit.
- srl $v0, $t0, 28 # index = ((window >> (32 - 2 - 1)) & 1) * sizeof(u16)
- andi $v0, 2
- addu $v0, $t8 # value = table->lut0[index]
- lhu $v0, DECDCTTAB_lut0($v0)
- sll $t0, 3 # window <<= 3
- b .Lwrite_value
- addiu $t5, -3 # bit_offset -= 3
- #.word 0
+.Lac_prefix_1: # if (window >> 31)
+ sll window, 1
+ bltz window, .Lac_prefix_11
+ li temp, 0xfe00
.Lac_prefix_10:
# Prefix 10 marks the end of a block.
- li $v0, 0xfe00 # value = 0xfe00
- sll $t0, 2 # window <<= 2
- addiu $t5, -2 # bit_offset -= 2
- addiu $t6, -1 # block_index--
- bgez $t6, .Lwrite_value
- li $t7, 0 # coeff_index = 0
- b .Lwrite_value
- li $t6, 5 # if (block_index < 0) block_index = 5
+ # *output = 0xfe00
+ # coeff_index = 0
+ # if (--block_index < Y3) block_index = Cr
+ sll window, 1
+ addiu bit_offset, -2
+ sh temp, 0(output)
+
+ addiu block_index, -1
+ bgez block_index, .Lfeed_bitstream
+ li coeff_index, 0
+ b .Lfeed_bitstream
+ li block_index, 5
+
+.Lac_prefix_11:
+ # Prefix 11 is followed by a single bit. Note that the 10/11 prefix check
+ # already shifts the window by one bit (without updating the bit offset).
+ # index = ((window >> (32 - 1 - 1)) & 1) * sizeof(uint16_t)
+ # *output = huffman_table->ac0[index]
+ srl value, window, 29
+ andi value, 2
+ addu value, huffman_table
+ lhu value, VLC_Table_ac0(value)
+ sll window, 2
+ addiu bit_offset, -3
+
+ b .Lfeed_bitstream
+ sh value, 0(output)
.Lac_prefix_01:
# Prefix 01 can be followed by a 2-bit lookup index starting with 1, or a
# 3-bit lookup index starting with 0. A 32-bit lookup table is used,
# containing both MDEC codes and lengths.
- srl $v0, $t0, 25 # index = ((window >> (32 - 2 - 3)) & 7) * sizeof(u32)
- andi $v0, 28
- addu $v0, $t8 # value = table->lut2[index]
- lw $v0, DECDCTTAB_lut2($v0)
- b .Lupdate_window_and_write
- srl $v1, $v0, 16 # length = value >> 16
+ # index = ((window >> (32 - 2 - 3)) & 7) * sizeof(uint32_t)
+ # *output = huffman_table->ac2[index] & 0xffff
+ # length = huffman_table->ac2[index] >> 16
+ srl value, window, 25
+ andi value, 28
+ addu value, huffman_table
+ lw value, VLC_Table_ac2(value)
+
+ b .Lupdate_window_ac # update_window(value >> 16)
+ sh value, 0(output)
.word 0, 0
.Lac_prefix_001:
# Prefix 001 can be followed by a 6-bit lookup index starting with 00, or a
# 3-bit lookup index starting with 01/10/11.
- srl $v0, $t0, 21 # index = ((window >> (32 - 3 - 6)) & 63) * sizeof(u32)
- andi $v0, 252
- addu $v0, $t8 # value = table->lut3[index]
- lw $v0, DECDCTTAB_lut3($v0)
- b .Lupdate_window_and_write
- srl $v1, $v0, 16 # length = value >> 16
+ # index = ((window >> (32 - 3 - 6)) & 63) * sizeof(uint32_t)
+ # *output = huffman_table->ac3[index] & 0xffff
+ # length = huffman_table->ac3[index] >> 16
+ srl value, window, 21
+ andi value, 252
+ addu value, huffman_table
+ lw value, VLC_Table_ac3(value)
+
+ b .Lupdate_window_ac # update_window(value >> 16)
+ sh value, 0(output)
.word 0, 0
.Lac_prefix_0001:
# Prefix 0001 is followed by a 3-bit lookup index.
- srl $v0, $t0, 24 # index = ((window >> (32 - 4 - 3)) & 7) * sizeof(u16)
- andi $v0, 14
- addu $v0, $t8 # value = table->lut4[index]
- lhu $v0, DECDCTTAB_lut4($v0)
- sll $t0, 7 # window <<= 4 + 3
- b .Lwrite_value
- addiu $t5, -7 # bit_offset -= 4 + 3
- .word 0
+ # index = ((window >> (32 - 4 - 3)) & 7) * sizeof(uint16_t)
+ # *output = huffman_table->ac4[index]
+ srl value, window, 24
+ andi value, 14
+ addu value, huffman_table
+ lhu value, VLC_Table_ac4(value)
+ sll window, 7
+ addiu bit_offset, -7
+
+ b .Lfeed_bitstream
+ sh value, 0(output)
.Lac_prefix_00001:
# Prefix 00001 is followed by a 3-bit lookup index.
- srl $v0, $t0, 23 # index = ((window >> (32 - 5 - 3)) & 7) * sizeof(u16)
- andi $v0, 14
- addu $v0, $t8 # value = table->lut5[index]
- lhu $v0, DECDCTTAB_lut5($v0)
- sll $t0, 8 # window <<= 5 + 3
- b .Lwrite_value
- addiu $t5, -8 # bit_offset -= 5 + 3
- .word 0
+ # index = ((window >> (32 - 5 - 3)) & 7) * sizeof(uint16_t)
+ # *output = huffman_table->ac5[index]
+ srl value, window, 23
+ andi value, 14
+ addu value, huffman_table
+ lhu value, VLC_Table_ac5(value)
+ sll window, 8
+ addiu bit_offset, -8
+
+ b .Lfeed_bitstream
+ sh value, 0(output)
.Lac_prefix_000001:
# Prefix 000001 is an escape code followed by a full 16-bit MDEC value.
- srl $v0, $t0, 10 # value = window >> (32 - 6 - 16)
- sll $t0, 22 # window <<= 6 + 16
- b .Lwrite_value
- addiu $t5, -22 # bit_offset -= 6 + 16
- .word 0, 0, 0, 0
+ # *output = window >> (32 - 6 - 16)
+ srl value, window, 10
+ sll window, 22
+ addiu bit_offset, -22
+
+ b .Lfeed_bitstream
+ sh value, 0(output)
+ .word 0, 0, 0
.Lac_prefix_0000001:
# Prefix 0000001 is followed by a 4-bit lookup index.
- srl $v0, $t0, 20 # index = ((window >> (32 - 7 - 4)) & 15) * sizeof(u16)
- andi $v0, 30
- addu $v0, $t8 # value = table->lut7[index]
- lhu $v0, DECDCTTAB_lut7($v0)
- sll $t0, 11 # window <<= 7 + 4
- b .Lwrite_value
- addiu $t5, -11 # bit_offset -= 7 + 4
- .word 0
+ # index = ((window >> (32 - 7 - 4)) & 15) * sizeof(uint16_t)
+ # *output = huffman_table->ac7[index]
+ srl value, window, 20
+ andi value, 30
+ addu value, huffman_table
+ lhu value, VLC_Table_ac7(value)
+ sll window, 11
+ addiu bit_offset, -11
+
+ b .Lfeed_bitstream
+ sh value, 0(output)
.Lac_prefix_00000001:
# Prefix 00000001 is followed by a 5-bit lookup index.
- srl $v0, $t0, 18 # index = ((window >> (32 - 8 - 5)) & 31) * sizeof(u16)
- andi $v0, 62
- addu $v0, $t8 # value = table->lut8[index]
- lhu $v0, DECDCTTAB_lut8($v0)
- sll $t0, 13 # window <<= 8 + 5
- b .Lwrite_value
- addiu $t5, -13 # bit_offset -= 8 + 5
- .word 0
+ # index = ((window >> (32 - 8 - 5)) & 31) * sizeof(uint16_t)
+ # *output = huffman_table->ac8[index]
+ srl value, window, 18
+ andi value, 62
+ addu value, huffman_table
+ lhu value, VLC_Table_ac8(value)
+ sll window, 13
+ addiu bit_offset, -13
+
+ b .Lfeed_bitstream
+ sh value, 0(output)
.Lac_prefix_000000001:
# Prefix 000000001 is followed by a 5-bit lookup index.
- srl $v0, $t0, 17 # index = ((window >> (32 - 9 - 5)) & 31) * sizeof(u16)
- andi $v0, 62
- addu $v0, $t8 # value = table->lut9[index]
- lhu $v0, DECDCTTAB_lut9($v0)
- sll $t0, 14 # window <<= 9 + 5
- b .Lwrite_value
- addiu $t5, -14 # bit_offset -= 9 + 5
- .word 0
+ # index = ((window >> (32 - 9 - 5)) & 31) * sizeof(uint16_t)
+ # *output = huffman_table->ac9[index]
+ srl value, window, 17
+ andi value, 62
+ addu value, huffman_table
+ lhu value, VLC_Table_ac9(value)
+ sll window, 14
+ addiu bit_offset, -14
+
+ b .Lfeed_bitstream
+ sh value, 0(output)
.Lac_prefix_0000000001:
# Prefix 0000000001 is followed by a 5-bit lookup index.
- srl $v0, $t0, 16 # index = ((window >> (32 - 10 - 5)) & 31) * sizeof(u16)
- andi $v0, 62
- addu $v0, $t8 # value = table->lut10[index]
- lhu $v0, DECDCTTAB_lut10($v0)
- sll $t0, 15 # window <<= 10 + 5
- b .Lwrite_value
- addiu $t5, -15 # bit_offset -= 10 + 5
- .word 0
+ # index = ((window >> (32 - 10 - 5)) & 31) * sizeof(uint16_t)
+ # *output = huffman_table->ac10[index]
+ srl value, window, 16
+ andi value, 62
+ addu value, huffman_table
+ lhu value, VLC_Table_ac10(value)
+ sll window, 15
+ addiu bit_offset, -15
+
+ b .Lfeed_bitstream
+ sh value, 0(output)
.Lac_prefix_00000000001:
# Prefix 00000000001 is followed by a 5-bit lookup index.
- srl $v0, $t0, 15 # index = ((window >> (32 - 11 - 5)) & 31) * sizeof(u16)
- andi $v0, 62
- addu $v0, $t8 # value = table->lut11[index]
- lhu $v0, DECDCTTAB_lut11($v0)
- sll $t0, 16 # window <<= 11 + 5
- b .Lwrite_value
- addiu $t5, -16 # bit_offset -= 11 + 5
- .word 0
+ # index = ((window >> (32 - 11 - 5)) & 31) * sizeof(uint16_t)
+ # *output = huffman_table->ac11[index]
+ srl value, window, 15
+ andi value, 62
+ addu value, huffman_table
+ lhu value, VLC_Table_ac11(value)
+ sll window, 16
+ addiu bit_offset, -16
+
+ b .Lfeed_bitstream
+ sh value, 0(output)
.Lac_prefix_000000000001:
# Prefix 000000000001 is followed by a 5-bit lookup index.
- srl $v0, $t0, 14 # index = ((window >> (32 - 12 - 5)) & 31) * sizeof(u16)
- andi $v0, 62
- addu $v0, $t8 # value = table->lut12[index]
- lhu $v0, DECDCTTAB_lut12($v0)
- sll $t0, 17 # window <<= 12 + 5
- b .Lwrite_value
- addiu $t5, -17 # bit_offset -= 12 + 5
- .word 0
-
-.Lupdate_window_and_write:
- sllv $t0, $t0, $v1 # window <<= length
- subu $t5, $v1 # bit_offset -= length
-.Lwrite_value:
- sh $v0, 0($a1)
+ # index = ((window >> (32 - 12 - 5)) & 31) * sizeof(uint16_t)
+ # *output = huffman_table->ac12[index]
+ srl value, window, 14
+ andi value, 62
+ addu value, huffman_table
+ lhu value, VLC_Table_ac12(value)
+ sll window, 17
+ addiu bit_offset, -17
+
+ b .Lfeed_bitstream
+ sh value, 0(output)
+
+.Lupdate_window_ac:
+ srl length, value, 16
+.Lupdate_window_dc:
+ sllv window, window, length
+ subu bit_offset, length
+
.Lfeed_bitstream:
# Update the window. This makes sure the next iteration of the loop will be
# able to read up to 32 bits from the bitstream.
- bgez $t5, .Lskip_feeding # if (bit_offset < 0)
- addiu $a2, -1 # max_size--
-
- subu $v0, $0, $t5 # window = next_window << (-bit_offset)
- sllv $t0, $t1, $v0
- lw $t1, 0($a3) # next_window = (*input << 16) | (*input >> 16)
- addiu $t5, 32 # bit_offset += 32
- srl $v0, $t1, 16
- sll $t1, 16
- or $t1, $v0
- addiu $a3, 4 # input++
+ bgez bit_offset, .Lskip_feeding # if (bit_offset < 0)
+ addiu max_size, -1
+
+ subu temp, $0, bit_offset # window = next_window << (-bit_offset)
+ sllv window, next_window, temp
+ lw next_window, 0(input) # next_window = (*input << 16) | (*input >> 16)
+ addiu bit_offset, 32
+ srl temp, next_window, 16
+ sll next_window, 16
+ or next_window, temp
+ addiu input, 4
.Lskip_feeding:
- srlv $v0, $t1, $t5 # window |= next_window >> bit_offset
- or $t0, $v0
+ srlv temp, next_window, bit_offset # window |= next_window >> bit_offset
+ or window, temp
- bnez $a2, .Lprocess_next_code_loop
- addiu $a1, 2 # output++
+ bnez max_size, .Lprocess_next_code_loop
+ addiu output, 2
.Lstop_processing:
# If remaining = 0, skip flushing the context, pad the output buffer with
# end-of-block codes if necessary and return 0. Otherwise flush the context
# and return 1.
- beqz $t2, .Lpad_output_buffer
- nop
-
- sw $a3, VLC_Context_input($a0)
- sw $t0, VLC_Context_window($a0)
- sw $t1, VLC_Context_next_window($a0)
- sw $t2, VLC_Context_remaining($a0)
- sh $t3, VLC_Context_quant_scale($a0)
- sb $t4, VLC_Context_is_v3($a0)
- sb $t5, VLC_Context_bit_offset($a0)
- sb $t6, VLC_Context_block_index($a0)
- sb $t7, VLC_Context_coeff_index($a0)
-
- jr $ra
+ # The delay slot preloads the end-of-block code used by the padding loop.
+ beqz remaining, .Lpad_output_buffer
+ li temp, 0xfe00
+
+ # Every field that DecDCTvlcContinue() reloads must be written back here,
+ # including is_v3 and quant_scale: DecDCTvlcStart() builds the context in
+ # registers only, so this flush is the only place where they reach memory.
+ sw input, VLC_Context_input(ctx)
+ sw window, VLC_Context_window(ctx)
+ sw next_window, VLC_Context_next_window(ctx)
+ sw remaining, VLC_Context_remaining(ctx)
+ sb is_v3, VLC_Context_is_v3(ctx)
+ sb bit_offset, VLC_Context_bit_offset(ctx)
+ sb block_index, VLC_Context_block_index(ctx)
+ sb coeff_index, VLC_Context_coeff_index(ctx)
+ sh quant_scale, VLC_Context_quant_scale(ctx)
+ sh last_y, VLC_Context_last_y(ctx)
+ sh last_cr, VLC_Context_last_cr(ctx)
+ sh last_cb, VLC_Context_last_cb(ctx)
+
+ # Return 1 (more data left to decode) through the common epilogue, which
+ # restores the saved registers.
+ b .Lreturn
li $v0, 1
.Lpad_output_buffer:
- beqz $a2, .Lreturn_zero
- li $v0, 0xfe00
-.Lpad_output_buffer_loop: # while (max_size)
- sh $v0, 0($a1) # *output = 0xfe00
- addiu $a2, -1 # max_size--
- bnez $a2, .Lpad_output_buffer_loop
- addiu $a1, 2 # output++
+ beqz max_size, .Lreturn
+ li $v0, 0
-.Lreturn_zero:
+.Lpad_output_buffer_loop: # while (max_size)
+ sh temp, 0(output)
+ addiu max_size, -1
+ bnez max_size, .Lpad_output_buffer_loop
+ addiu output, 2
+
+.Lreturn:
+ lw $s0, 0($sp)
+ lw $s1, 4($sp)
+ lw $s2, 8($sp)
+ lw $s3, 12($sp)
jr $ra
- li $v0, 0
+ addiu $sp, 16
diff --git a/libpsn00b/psxpress/vlc2.c b/libpsn00b/psxpress/vlc2.c
index 9eb99bf..24c54ce 100644
--- a/libpsn00b/psxpress/vlc2.c
+++ b/libpsn00b/psxpress/vlc2.c
@@ -63,7 +63,7 @@ static const uint32_t _compressed_table[TABLE_LENGTH] = {
static VLC_Context _default_context;
static size_t _max_buffer_size = 0;
-const DECDCTTAB2 *_vlc_huffman_table2 = 0;
+const DECDCTTAB *_vlc_huffman_table2 = 0;
/* VLC decoder */
@@ -77,14 +77,17 @@ int __attribute__((optimize(3))) DecDCTvlcContinue2(
VLC_Context *ctx, uint32_t *buf, size_t max_size
) {
const uint32_t *input = ctx->input;
- uint32_t remaining = ctx->remaining;
uint32_t window = ctx->window;
uint32_t next_window = ctx->next_window;
- uint16_t quant_scale = ctx->quant_scale;
+ uint32_t remaining = ctx->remaining;
+ int is_v3 = ctx->is_v3;
+ int bit_offset = ctx->bit_offset;
int block_index = ctx->block_index;
int coeff_index = ctx->coeff_index;
- int bit_offset = ctx->bit_offset;
- int is_v3 = ctx->is_v3;
+ uint16_t quant_scale = ctx->quant_scale;
+ int16_t last_y = ctx->last_y;
+ int16_t last_cr = ctx->last_cr;
+ int16_t last_cb = ctx->last_cb;
//if (!_vlc_huffman_table2)
//return -1;
@@ -122,13 +125,13 @@ int __attribute__((optimize(3))) DecDCTvlcContinue2(
} else if (window >> 24) {
// The first lookup table is for codes that do not start with
// 00000000.
- value = _vlc_huffman_table2->lut[_get_bits_unsigned(13)];
+ value = _vlc_huffman_table2->ac[_get_bits_unsigned(13)];
_advance_window(value >> 16);
*output = (uint16_t) value;
} else {
// If the code starts with 00000000, use the second lookup
// table.
- value = _vlc_huffman_table2->lut00[_get_bits_unsigned(17)];
+ value = _vlc_huffman_table2->ac00[_get_bits_unsigned(17)];
_advance_window(value >> 16);
*output = (uint16_t) value;
}
@@ -176,12 +179,15 @@ int __attribute__((optimize(3))) DecDCTvlcContinue2(
return 0;
ctx->input = input;
- ctx->remaining = remaining;
ctx->window = window;
ctx->next_window = next_window;
+ ctx->remaining = remaining;
+ ctx->bit_offset = bit_offset;
ctx->block_index = block_index;
ctx->coeff_index = coeff_index;
- ctx->bit_offset = bit_offset;
+ ctx->last_y = last_y;
+ ctx->last_cr = last_cr;
+ ctx->last_cb = last_cb;
return 1;
}
@@ -197,21 +203,24 @@ int DecDCTvlcStart2(
return -1;
ctx->input = &input[2];
- ctx->remaining = (header->mdec0_header & 0xffff) * 2;
ctx->window = (input[0] << 16) | (input[0] >> 16);
ctx->next_window = (input[1] << 16) | (input[1] >> 16);
- ctx->quant_scale = (header->quant_scale & 63) << 10;
+ ctx->remaining = (header->mdec0_header & 0xffff) * 2;
+ ctx->is_v3 = (header->version >= 3);
+ ctx->bit_offset = 32;
ctx->block_index = 0;
ctx->coeff_index = 0;
- ctx->bit_offset = 32;
- ctx->is_v3 = (header->version == 3);
+ ctx->quant_scale = (header->quant_scale & 63) << 10;
+ ctx->last_y = 0;
+ ctx->last_cr = 0;
+ ctx->last_cb = 0;
return DecDCTvlcContinue2(ctx, buf, max_size);
}
/* Stateful VLC decoder API (for Sony SDK compatibility) */
-int DecDCTvlc2(const uint32_t *bs, uint32_t *buf, DECDCTTAB2 *table) {
+int DecDCTvlc2(const uint32_t *bs, uint32_t *buf, DECDCTTAB *table) {
if (table)
_vlc_huffman_table2 = table;
@@ -230,7 +239,7 @@ size_t DecDCTvlcSize2(size_t size) {
/* Lookup table decompressor */
-void DecDCTvlcBuild(DECDCTTAB2 *table) {
+void DecDCTvlcBuild(DECDCTTAB *table) {
uint32_t *output = (uint32_t *) table;
_vlc_huffman_table2 = table;
diff --git a/libpsn00b/psxspu/common.c b/libpsn00b/psxspu/common.c
index 6ccbef4..1275621 100644
--- a/libpsn00b/psxspu/common.c
+++ b/libpsn00b/psxspu/common.c
@@ -1,6 +1,6 @@
/*
* PSn00bSDK SPU library (common functions)
- * (C) 2022 spicyjpeg - MPL licensed
+ * (C) 2022-2023 spicyjpeg - MPL licensed
*/
#include <stdint.h>
@@ -32,7 +32,7 @@ static void _wait_status(uint16_t mask, uint16_t value) {
return;
}
- _sdk_log("status register timeout (0x%04x)\n", SPU_STAT);
+ _sdk_log("timeout, status=0x%04x\n", SPU_STAT);
}
static size_t _dma_transfer(uint32_t *data, size_t length, int write) {