diff options
| author | spicyjpeg <thatspicyjpeg@gmail.com> | 2023-01-23 09:36:22 +0100 |
|---|---|---|
| committer | spicyjpeg <thatspicyjpeg@gmail.com> | 2023-01-23 09:36:22 +0100 |
| commit | 09f321e37fc187affa664d32e36e32c0533a7e8e (patch) | |
| tree | 27f846c194d92a9f4f8e3daea4ff2317e3e66894 | |
| parent | a21e949c9aea98cb4b3feee48bb98579bbdfba70 (diff) | |
Add BS v3 decoding, fix MDEC API and strvideo example
| -rw-r--r-- | examples/mdec/strvideo/main.c | 74 | ||||
| -rw-r--r-- | libpsn00b/include/psxpress.h | 126 | ||||
| -rw-r--r-- | libpsn00b/psxpress/README.md | 29 | ||||
| -rw-r--r-- | libpsn00b/psxpress/mdec.c | 37 | ||||
| -rw-r--r-- | libpsn00b/psxpress/vlc.c | 160 | ||||
| -rw-r--r-- | libpsn00b/psxpress/vlc.s | 743 | ||||
| -rw-r--r-- | libpsn00b/psxpress/vlc2.c | 39 | ||||
| -rw-r--r-- | libpsn00b/psxspu/common.c | 4 |
8 files changed, 763 insertions, 449 deletions
diff --git a/examples/mdec/strvideo/main.c b/examples/mdec/strvideo/main.c index 28d39b2..57cb6ef 100644 --- a/examples/mdec/strvideo/main.c +++ b/examples/mdec/strvideo/main.c @@ -1,6 +1,6 @@ /* * PSn00bSDK .STR FMV playback example - * (C) 2022 spicyjpeg - MPL licensed + * (C) 2022-2023 spicyjpeg - MPL licensed * * This example demonstrates playback of full-motion video in the standard .STR * format, using the MDEC for frame decoding and XA for audio. Decoded frames @@ -34,9 +34,10 @@ * Playback is stopped once the .STR header is no longer present in sectors * read. * - * Note that PSn00bSDK's bitstream decoding API only supports version 1 and 2 - * bitstreams currently, so make sure your .STR files are encoded as v2 and not - * v3. + * PSn00bSDK's bitstream decoding API supports both version 2 and 3 bitstreams. + * Encoding your .STR files as v3 may result in slightly higher quality + * depending on the encoder, but also higher CPU usage during playback compared + * to the older v2. */ #include <stdint.h> @@ -102,13 +103,12 @@ void init_context(RenderContext *ctx) { FntOpen(4, 12, 312, 16, 2, 256); } -void display(RenderContext *ctx, int sync) { +void display(RenderContext *ctx) { Framebuffer *db; ctx->db_active ^= 1; DrawSync(0); - if (sync) - VSync(0); + //VSync(0); db = &(ctx->db[ctx->db_active]); PutDrawEnv(&(db->draw)); @@ -163,13 +163,13 @@ typedef struct { volatile int8_t cur_frame, cur_slice; } StreamContext; -StreamContext str_ctx; +static StreamContext str_ctx; // This buffer is used by cd_sector_handler() as a temporary area for sectors // read from the CD. Due to DMA limitations it can't be allocated on the stack // (especially not in the interrupt callbacks' stack, whose size is very // limited). -STR_Header sector_header; +static STR_Header sector_header; void cd_sector_handler(void) { StreamBuffer *frame = &str_ctx.frames[str_ctx.cur_frame]; @@ -268,7 +268,7 @@ void init_stream(void) { // optional but makes the decompressor slightly faster. 
See the libpsxpress // documentation for more details. DecDCTvlcSize(0x8000); - DecDCTvlcCopyTable((DECDCTTAB *) 0x1f800000); + DecDCTvlcCopyTableV3((VLC_TableV3 *) 0x1f800000); str_ctx.cur_frame = 0; str_ctx.cur_slice = 0; @@ -309,7 +309,7 @@ void start_stream(CdlFILE *file) { static RenderContext ctx; -#define SHOW_STATUS(...) { FntPrint(-1, __VA_ARGS__); FntFlush(-1); display(&ctx, 1); } +#define SHOW_STATUS(...) { FntPrint(-1, __VA_ARGS__); FntFlush(-1); display(&ctx); } #define SHOW_ERROR(...) { SHOW_STATUS(__VA_ARGS__); while (1) __asm__("nop"); } int main(int argc, const char* argv[]) { @@ -318,7 +318,7 @@ int main(int argc, const char* argv[]) { SHOW_STATUS("INITIALIZING\n"); SpuInit(); CdInit(); - InitGeom(); // Required for PSn00bSDK's DecDCTvlc() + InitGeom(); // GTE initialization required by the VLC decompressor DecDCTReset(0); SHOW_STATUS("OPENING VIDEO FILE\n"); @@ -330,8 +330,9 @@ int main(int argc, const char* argv[]) { init_stream(); start_stream(&file); - // Disable framebuffer clearing to get rid of flickering during playback. - display(&ctx, 1); + // Clear the screen, then disable framebuffer clearing to get rid of + // flickering during playback. + display(&ctx); ctx.db[0].draw.isbg = 0; ctx.db[1].draw.isbg = 0; #ifdef DISP_24BPP @@ -339,9 +340,13 @@ int main(int argc, const char* argv[]) { ctx.db[1].disp.isrgb24 = 1; #endif - int decode_errors = 0; + int frame_time = 1, decode_errors = 0; while (1) { +#ifdef DRAW_OVERLAY + int frame_start = TIMER_VALUE(1); +#endif + // Wait for a full frame to be read from the disc and decompress the // bitstream into the format expected by the MDEC. If the video has // ended, restart playback from the beginning. @@ -355,38 +360,45 @@ int main(int argc, const char* argv[]) { } #ifdef DRAW_OVERLAY - // Measure CPU usage of the decompressor using the hblank counter. 
- int total_time = TIMER_VALUE(1) + 1; - TIMER_VALUE(1) = 0; + int decode_time = TIMER_VALUE(1); #endif - if (DecDCTvlc(frame->bs_data, frame->mdec_data)) { + VLC_Context vlc_ctx; + if (DecDCTvlcStart( + &vlc_ctx, + frame->mdec_data, + sizeof(frame->mdec_data) / 4, + frame->bs_data + )) { decode_errors++; continue; } #ifdef DRAW_OVERLAY - int cpu_usage = TIMER_VALUE(1) * 100 / total_time; + // Calculate CPU usage of the decompressor. + decode_time = (TIMER_VALUE(1) - decode_time) & 0xffff; + int cpu_usage = decode_time * 100 / frame_time; #endif // Wait for the MDEC to finish decoding the previous frame, then flip // the framebuffers to display it and prepare the buffer for the next // frame. - // NOTE: you should *not* call VSync(0) during playback, as the refresh - // rate of the GPU is not synced to the video's frame rate. If you want - // to minimize screen tearing, consider triple buffering instead (i.e. - // always keep 2 fully decoded frames in VRAM and use VSyncCallback() - // to register a function that displays the next decoded frame whenever - // vblank occurs). + // NOTE: as the refresh rate of the GPU is not synced to the video's + // frame rate, this VSync(0) call may potentially end up waiting too + // long and desynchronizing playback. A better solution would be to + // implement triple buffering (i.e. always keep 2 fully decoded frames + // in VRAM and use VSyncCallback() to register a function that displays + // the next decoded frame if available whenever vblank occurs). 
+ VSync(0); DecDCTinSync(0); DecDCToutSync(0); #ifdef DRAW_OVERLAY - FntPrint(-1, "FRAME:%5d READ ERRORS: %5d\n", str_ctx.frame_id, str_ctx.dropped_frames); - FntPrint(-1, "CPU: %5d%% DECODE ERRORS:%5d\n", cpu_usage, decode_errors); + FntPrint(-1, "FRAME:%6d READ ERRORS: %6d\n", str_ctx.frame_id, str_ctx.dropped_frames); + FntPrint(-1, "CPU: %6d%% DECODE ERRORS:%6d\n", cpu_usage, decode_errors); FntFlush(-1); #endif - display(&ctx, 0); + display(&ctx); // Feed the newly decompressed frame to the MDEC. The MDEC will not // actually start decoding it until an output buffer is also configured @@ -414,6 +426,10 @@ int main(int argc, const char* argv[]) { str_ctx.slices[str_ctx.cur_slice], BLOCK_SIZE * str_ctx.slice_pos.h / 2 ); + +#ifdef DRAW_OVERLAY + frame_time = (TIMER_VALUE(1) - frame_start) & 0xffff; +#endif } return 0; diff --git a/libpsn00b/include/psxpress.h b/libpsn00b/include/psxpress.h index dc1d52c..c3b13f4 100644 --- a/libpsn00b/include/psxpress.h +++ b/libpsn00b/include/psxpress.h @@ -1,6 +1,6 @@ /* * PSn00bSDK MDEC library - * (C) 2022 spicyjpeg - MPL licensed + * (C) 2022-2023 spicyjpeg - MPL licensed */ /** @@ -17,7 +17,9 @@ * FMV playback is not part of this library per se, but can implemented using * the APIs defined here alongside some code to stream data from the CD drive. * - * Currently only version 1 and 2 .BS files are supported. + * Currently bitstream versions 1, 2 and 3 are supported. Version 0 and .IKI + * bitstreams are not supported, but no encoder is publicly available for those + * anyway. */ #ifndef __PSXPRESS_H @@ -34,28 +36,26 @@ typedef struct _DECDCTENV { int16_t dct[64]; // Inverse DCT matrix (2.14 fixed-point) } DECDCTENV; -// This is the "small" lookup table used by DecDCTvlc(). It can be copied to -// the scratchpad. 
+typedef struct _VLC_TableV2 { + uint16_t ac0[2]; + uint32_t ac2[8], ac3[64]; + uint16_t ac4[8], ac5[8], ac7[16], ac8[32]; + uint16_t ac9[32], ac10[32], ac11[32], ac12[32]; +} VLC_TableV2; + +typedef struct _VLC_TableV3 { + uint16_t ac0[2]; + uint32_t ac2[8], ac3[64]; + uint16_t ac4[8], ac5[8], ac7[16], ac8[32]; + uint16_t ac9[32], ac10[32], ac11[32], ac12[32]; + uint8_t dc[128], dc_len[9]; + uint8_t _reserved[3]; +} VLC_TableV3; + typedef struct _DECDCTTAB { - uint16_t lut0[2]; - uint32_t lut2[8]; - uint32_t lut3[64]; - uint16_t lut4[8]; - uint16_t lut5[8]; - uint16_t lut7[16]; - uint16_t lut8[32]; - uint16_t lut9[32]; - uint16_t lut10[32]; - uint16_t lut11[32]; - uint16_t lut12[32]; + uint32_t ac[8192], ac00[512]; } DECDCTTAB; -// This is the "large" table used by DecDCTvlc2(). -typedef struct _DECDCTTAB2 { - uint32_t lut[8192]; - uint32_t lut00[512]; -} DECDCTTAB2; - typedef enum _DECDCTMODE { DECDCT_MODE_24BPP = 1, DECDCT_MODE_16BPP = 0, @@ -66,8 +66,9 @@ typedef enum _DECDCTMODE { typedef struct _VLC_Context { const uint32_t *input; uint32_t window, next_window, remaining; - uint16_t quant_scale; int8_t is_v3, bit_offset, block_index, coeff_index; + uint16_t quant_scale; + int16_t last_y, last_cr, last_cb; } VLC_Context; // Despite what some docs claim, the "number of 32-byte blocks" and "always @@ -233,8 +234,9 @@ int DecDCToutSync(int mode); * frame) into a buffer that can be passed to DecDCTin(). This function uses a * small (<1 KB) lookup table combined with the GTE to accelerate the process; * performance is roughly on par with DecDCTvlcStart2() if the lookup table - * is copied to the scratchpad beforehand by calling DecDCTvlcCopyTable(). The - * contents of the GTE's LZCR register, if any, will be destroyed. + * is copied to the scratchpad beforehand by calling DecDCTvlcCopyTableV2() or + * DecDCTvlcCopyTableV3(). The contents of the GTE's LZCS and LZCR registers, + * if any, will be destroyed. 
* * A VLC_Context object must be created and passed to this function, which will * then proceed to initialize its fields. The max_size argument sets the @@ -256,7 +258,7 @@ int DecDCToutSync(int mode); * @param bs * @return 0, 1 if more data needs to be output or -1 in case of failure * - * @see DecDCTvlcContinue(), DecDCTvlcCopyTable() + * @see DecDCTvlcContinue(), DecDCTvlcCopyTableV2(), DecDCTvlcCopyTableV3() */ int DecDCTvlcStart(VLC_Context *ctx, uint32_t *buf, size_t max_size, const uint32_t *bs); @@ -275,7 +277,8 @@ int DecDCTvlcStart(VLC_Context *ctx, uint32_t *buf, size_t max_size, const uint3 * context returned 0; in that case the context shall be discarded or reused to * decode another bitstream. * - * The contents of the GTE's LZCR register, if any, will be destroyed. + * The contents of the GTE's LZCS and LZCR registers, if any, will be + * destroyed. * * See DecDCTvlcStart() for more details. * @@ -309,7 +312,7 @@ int DecDCTvlcContinue(VLC_Context *ctx, uint32_t *buf, size_t max_size); * @param buf * @return 0, 1 if more data needs to be output or -1 in case of failure * - * @see DecDCTvlcSize(), DecDCTvlcCopyTable() + * @see DecDCTvlcSize(), DecDCTvlcCopyTableV2(), DecDCTvlcCopyTableV3() */ int DecDCTvlc(const uint32_t *bs, uint32_t *buf); @@ -332,23 +335,60 @@ int DecDCTvlc(const uint32_t *bs, uint32_t *buf); size_t DecDCTvlcSize(size_t size); /** - * @brief Moves the lookup table used by the .BS decompressor to the scratchpad - * region. + * @brief Copies the lookup tables used by the .BS decompressor (v1/v2) to the + * scratchpad region. + * + * @details Copies the lookup table used by DecDCTvlcContinue(), + * DecDCTvlcStart() and DecDCTvlc() to the specified address. A copy of this + * table is always present in main RAM, however this function can be used to + * copy it to the scratchpad region to boost decompression performance. 
+ * + * This function copies a 676-byte table (VLC_TableV2 structure) containing + * only the data necessary for decoding version 1 and 2 bitstreams, to help + * save scratchpad space. If support for version 3 is required, + * DecDCTvlcCopyTableV3() can be used instead to copy the full 816-byte table. + * + * The address passed to this function is saved. Calls to DecDCTvlcStart(), + * DecDCTvlcContinue() and DecDCTvlc() will automatically use the last table + * copied. Call DecDCTvlcCopyTableV2(0) or DecDCTvlcCopyTableV3(0) to revert to + * using the library's internal table in main RAM. + * + * WARNING: attempting to decode a version 3 .BS file or .STR frame after + * calling this function will result in undefined behavior and potentially a + * crash. To re-enable version 3 decoding, use DecDCTvlcCopyTableV3() to copy + * the full table to the scratchpad or revert to using the built-in table in + * main RAM. + * + * @param addr Pointer to free 676-byte area in scratchpad region or 0 to reset + * + * @see DecDCTvlcCopyTableV3() + */ +void DecDCTvlcCopyTableV2(VLC_TableV2 *addr); + +/** + * @brief Copies the lookup tables used by the .BS decompressor (v1/v2/v3) to + * the scratchpad region. * - * @details Copies the small (<1 KB) lookup table used by DecDCTvlcContinue(), - * DecDCTvlcStart() and DecDCTvlc() (a DECDCTTAB structure) to the specified - * address. A copy of this table is always present in main RAM, however this - * function can be used to copy it to the scratchpad region to boost - * decompression performance. + * @details Copies the lookup table used by DecDCTvlcContinue(), + * DecDCTvlcStart() and DecDCTvlc() to the specified address. A copy of this + * table is always present in main RAM, however this function can be used to + * copy it to the scratchpad region to boost decompression performance. + * + * This function copies the full 816-byte table (VLC_TableV3 structure), + * including the data used to decode version 3 bitstreams. 
If support for + * version 3 is not required, DecDCTvlcCopyTableV2() can be used instead to + * save scratchpad space by only copying the first 676 bytes of the table. * * The address passed to this function is saved. Calls to DecDCTvlcStart(), * DecDCTvlcContinue() and DecDCTvlc() will automatically use the last table - * copied. Call DecDCTvlcCopyTable(0) to revert to using the library's internal - * table in main RAM. + * copied. Call DecDCTvlcCopyTableV2(0) or DecDCTvlcCopyTableV3(0) to revert to + * using the library's internal table in main RAM. + * + * @param addr Pointer to free 816-byte area in scratchpad region or 0 to reset * - * @param addr Pointer to free area in scratchpad region or 0 to reset + * @see DecDCTvlcCopyTableV2() */ -void DecDCTvlcCopyTable(DECDCTTAB *addr); +void DecDCTvlcCopyTableV3(VLC_TableV3 *addr); /** * @brief Decompresses or begins decompressing a .BS file into MDEC codes @@ -360,8 +400,8 @@ void DecDCTvlcCopyTable(DECDCTTAB *addr); * calling DecDCTvlcBuild(), but does not use the GTE nor the scratchpad. * Depending on the specific bitstream being decoded DecDCTvlcStart2() might be * slightly faster or slower than DecDCTvlcStart() with its lookup table copied - * to the scratchpad (see DecDCTvlcCopyTable()). DecDCTvlcStart() with the - * table in main RAM tends to be much slower. + * to the scratchpad (see DecDCTvlcCopyTableV2() and DecDCTvlcCopyTableV3()). + * DecDCTvlcStart() with the table in main RAM tends to be much slower. * * A VLC_Context object must be created and passed to this function, which will * then proceed to initialize its fields. 
The max_size argument sets the @@ -432,7 +472,7 @@ int DecDCTvlcContinue2(VLC_Context *ctx, uint32_t *buf, size_t max_size); * * @see DecDCTvlcSize2(), DecDCTvlcBuild() */ -int DecDCTvlc2(const uint32_t *bs, uint32_t *buf, DECDCTTAB2 *table); +int DecDCTvlc2(const uint32_t *bs, uint32_t *buf, DECDCTTAB *table); /** * @brief Sets the maximum amount of data to be decompressed (alternate @@ -458,7 +498,7 @@ size_t DecDCTvlcSize2(size_t size); * the .BS decompressor. * * @details Generates the lookup table required by DecDCTvlcStart2(), - * DecDCTvlcContinue2() and DecDCTvlc2() (a DECDCTTAB2 structure) into the + * DecDCTvlcContinue2() and DecDCTvlc2() (a DECDCTTAB structure) into the * specified buffer. Since the table is relatively large (34 KB), it is * recommended to only generate it in a dynamically-allocated buffer when * needed and deallocate the buffer afterwards. @@ -468,7 +508,7 @@ size_t DecDCTvlcSize2(size_t size); * * @param table */ -void DecDCTvlcBuild(DECDCTTAB2 *table); +void DecDCTvlcBuild(DECDCTTAB *table); #ifdef __cplusplus } diff --git a/libpsn00b/psxpress/README.md b/libpsn00b/psxpress/README.md index a894874..df18ec5 100644 --- a/libpsn00b/psxpress/README.md +++ b/libpsn00b/psxpress/README.md @@ -1,14 +1,19 @@ # PSn00bSDK MDEC library -This is a fully open source reimplementation of the official SDK's "data +This is a fully original reimplementation of the official SDK's "data compression" library. This library is made up of two parts, the MDEC API and functions to decompress Huffman-encoded bitstreams (.BS files, or frames in -.STR files) into data to be fed to the MDEC. FMV playback is not part of this -library (nor the official one) per se, but can implemented by using these APIs -alongside some code to stream data from the CD drive. +.STR files) into data to be fed to the MDEC. Two different implementations of +the latter are provided, one using the GTE and scratchpad region and an older +one using a large lookup table in main RAM. 
-**Currently only version 1 and 2 bitstreams are supported**. +FMV playback is not part of this library per se, but can be implemented using the +APIs defined here alongside some code to stream data from the CD drive. + +Currently bitstream versions 1, 2 and 3 are supported. Version 0 and .IKI +bitstreams are not supported, but no encoder is publicly available for those +anyway. ## MDEC API @@ -26,14 +31,16 @@ The following functions are currently provided: - `DecDCTvlcStart()`, `DecDCTvlcContinue()`: a decompressor implementation that uses a small (<1 KB) lookup table and leverages the GTE, written in assembly. - `DecDCTvlcCopyTable()` can optionally be called to temporarily move the table - to the scratchpad region to improve decompression speed. -- `DecDCTvlcStart2()`, `DecDCTvlcContinue2()`: a different implementation using + `DecDCTvlcCopyTableV2()` or `DecDCTvlcCopyTableV3()` may optionally be called + to temporarily move the table to the scratchpad region in order to boost + decompression speed. +- `DecDCTvlcStart2()`, `DecDCTvlcContinue2()`: an older implementation using a large (34 KB) lookup table in main RAM, written in C. The table must be - decompressed ahead of time using `DecDCTvlcBuild()`, but can be deallocated - when no longer needed. + decompressed ahead of time manually using `DecDCTvlcBuild()`, but can be + deallocated when no longer needed. **This implementation does not support** + **version 3 bitstreams**. - `DecDCTvlc()`, `DecDCTvlc2()`: wrappers around the functions listed above, - for compatibility with the Sony SDK. Using them is not recommended. + for compatibility with the Sony SDK. 
## SPU ADPCM encoding API diff --git a/libpsn00b/psxpress/mdec.c b/libpsn00b/psxpress/mdec.c index 3596188..394a0ce 100644 --- a/libpsn00b/psxpress/mdec.c +++ b/libpsn00b/psxpress/mdec.c @@ -1,12 +1,11 @@ /* * PSn00bSDK MDEC library (low-level MDEC/DMA API) - * (C) 2022 spicyjpeg - MPL licensed + * (C) 2022-2023 spicyjpeg - MPL licensed */ #include <stdint.h> #include <assert.h> #include <psxetc.h> -#include <psxapi.h> #include <psxpress.h> #include <hwregs_c.h> @@ -15,14 +14,14 @@ /* Default IDCT matrix and quantization tables */ -#define S0 0x5a82 // 0x4000 * cos(0/16 * pi) * sqrt(2) -#define S1 0x7d8a // 0x4000 * cos(1/16 * pi) * 2 -#define S2 0x7641 // 0x4000 * cos(2/16 * pi) * 2 -#define S3 0x6a6d // 0x4000 * cos(3/16 * pi) * 2 -#define S4 0x5a82 // 0x4000 * cos(4/16 * pi) * 2 -#define S5 0x471c // 0x4000 * cos(5/16 * pi) * 2 -#define S6 0x30fb // 0x4000 * cos(6/16 * pi) * 2 -#define S7 0x18f8 // 0x4000 * cos(7/16 * pi) * 2 +#define S0 0x5a82 // (1 << 14) * cos(0/16 * pi) * sqrt(2) +#define S1 0x7d8a // (1 << 14) * cos(1/16 * pi) * 2 +#define S2 0x7641 // (1 << 14) * cos(2/16 * pi) * 2 +#define S3 0x6a6d // (1 << 14) * cos(3/16 * pi) * 2 +#define S4 0x5a82 // (1 << 14) * cos(4/16 * pi) * 2 +#define S5 0x471c // (1 << 14) * cos(5/16 * pi) * 2 +#define S6 0x30fb // (1 << 14) * cos(6/16 * pi) * 2 +#define S7 0x18f8 // (1 << 14) * cos(7/16 * pi) * 2 static const DECDCTENV _default_mdec_env = { // The default luma and chroma quantization table is based on the MPEG-1 @@ -85,8 +84,6 @@ static const DECDCTENV _default_mdec_env = { /* Public API */ void DecDCTReset(int mode) { - FastEnterCriticalSection(); - SetDMAPriority(DMA_MDEC_IN, 3); SetDMAPriority(DMA_MDEC_OUT, 3); DMA_CHCR(DMA_MDEC_IN) = 0x00000201; // Stop DMA @@ -95,26 +92,28 @@ void DecDCTReset(int mode) { MDEC1 = 0x80000000; // Reset MDEC MDEC1 = 0x60000000; // Enable DMA in/out requests - FastExitCriticalSection(); if (!mode) DecDCTPutEnv(0, 0); } void DecDCTPutEnv(const DECDCTENV *env, int mono) { - const 
DECDCTENV *_env = env ? env : &_default_mdec_env; DecDCTinSync(0); + if (!env) + env = &_default_mdec_env; MDEC0 = 0x60000000; // Set IDCT matrix - DecDCTinRaw((const uint32_t *) _env->dct, 32); + DecDCTinRaw((const uint32_t *) env->dct, 32); DecDCTinSync(0); - MDEC0 = 0x40000000 | (mono ? 0 : 1); // Set table(s) - DecDCTinRaw((const uint32_t *) _env->iq_y, mono ? 16 : 32); + MDEC0 = 0x40000000 | (mono ? 0 : 1); // Set quantization table(s) + DecDCTinRaw((const uint32_t *) env->iq_y, mono ? 16 : 32); DecDCTinSync(0); } void DecDCTin(const uint32_t *data, int mode) { uint32_t header = *data; + DecDCTinSync(0); + if (mode == DECDCT_MODE_RAW) MDEC0 = header; else if (mode & DECDCT_MODE_24BPP) @@ -153,7 +152,7 @@ int DecDCTinSync(int mode) { return 0; } - _sdk_log("DecDCTinSync() timeout\n"); + _sdk_log("DecDCTinSync() timeout, MDEC1=0x%08x\n", MDEC1); return -1; } @@ -184,6 +183,6 @@ int DecDCToutSync(int mode) { return 0; } - _sdk_log("DecDCToutSync() timeout\n"); + _sdk_log("DecDCToutSync() timeout, CHCR=0x%08x\n", DMA_CHCR(DMA_MDEC_OUT)); return -1; } diff --git a/libpsn00b/psxpress/vlc.c b/libpsn00b/psxpress/vlc.c index 4e3e283..36cfbe2 100644 --- a/libpsn00b/psxpress/vlc.c +++ b/libpsn00b/psxpress/vlc.c @@ -1,6 +1,6 @@ /* * PSn00bSDK MDEC library (support code for the main VLC decompressor) - * (C) 2022 spicyjpeg - MPL licensed + * (C) 2022-2023 spicyjpeg - MPL licensed */ #include <stdint.h> @@ -10,87 +10,120 @@ /* Huffman code lookup table */ -#define _val1(rl, dc) (((rl) << 10) | ((uint16_t) (dc) & 0x3ff)) -#define _val2(rl, dc, len) (_val1(rl, dc) | (len << 16)) +#define _DC(y, c) (((y) << 4) | (c)) +#define _AC(rl, dc) (((rl) << 10) | ((uint16_t) (dc) & 0x3ff)) +#define _ACL(rl, dc, len) (_AC(rl, dc) | ((len) << 16)) -#define _pair(rl, dc) _val1(rl, dc), _val1(rl, -(dc)) -#define _pair2(rl, dc, len) _val2(rl, dc, len), _val2(rl, -(dc), len) -#define _pair3(rl, dc, len) \ - _val2(rl, dc, len), _val2(rl, dc, len), \ - _val2(rl, -(dc), len), _val2(rl, -(dc), 
len) -#define _pair4(rl, dc, len) \ - _val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), \ - _val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), \ - _val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len), \ - _val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len) +#define _DC2(y, c) _DC(y, c), _DC(y, c) +#define _DC3(y, c) _DC(y, c), _DC(y, c), _DC(y, c), _DC(y, c) +#define _DC4(y, c) \ + _DC(y, c), _DC(y, c), _DC(y, c), _DC(y, c), \ + _DC(y, c), _DC(y, c), _DC(y, c), _DC(y, c) +#define _AC2(rl, dc) _AC(rl, dc), _AC(rl, -(dc)) +#define _ACL2(rl, dc, len) _ACL(rl, dc, len), _ACL(rl, -(dc), len) +#define _ACL3(rl, dc, len) \ + _ACL(rl, dc, len), _ACL(rl, dc, len), \ + _ACL(rl, -(dc), len), _ACL(rl, -(dc), len) +#define _ACL4(rl, dc, len) \ + _ACL(rl, dc, len), _ACL(rl, dc, len), _ACL(rl, dc, len), _ACL(rl, dc, len), \ + _ACL(rl, dc, len), _ACL(rl, dc, len), _ACL(rl, dc, len), _ACL(rl, dc, len), \ + _ACL(rl, -(dc), len), _ACL(rl, -(dc), len), _ACL(rl, -(dc), len), _ACL(rl, -(dc), len), \ + _ACL(rl, -(dc), len), _ACL(rl, -(dc), len), _ACL(rl, -(dc), len), _ACL(rl, -(dc), len) // This table isn't compressed since it makes no sense to compress less than a // kilobyte's worth of data. 
-static const DECDCTTAB _default_huffman_table = { - .lut0 = { +static const VLC_TableV3 _default_huffman_table = { + .ac0 = { // 11 x - _pair( 0, 1) + _AC2( 0, 1) }, - .lut2 = { + .ac2 = { // 01 0xx - _pair2( 0, 2, 5), _pair2( 2, 1, 5), + _ACL2( 0, 2, 5), _ACL2( 2, 1, 5), // 01 1x- - _pair3( 1, 1, 4) + _ACL3( 1, 1, 4) }, - .lut3 = { + .ac3 = { // 001 00xxxx - _pair2(13, 1, 9), _pair2( 0, 6, 9), _pair2(12, 1, 9), _pair2(11, 1, 9), - _pair2( 3, 2, 9), _pair2( 1, 3, 9), _pair2( 0, 5, 9), _pair2(10, 1, 9), + _ACL2(13, 1, 9), _ACL2( 0, 6, 9), _ACL2(12, 1, 9), _ACL2(11, 1, 9), + _ACL2( 3, 2, 9), _ACL2( 1, 3, 9), _ACL2( 0, 5, 9), _ACL2(10, 1, 9), // 001 xxx--- - _pair4( 0, 3, 6), _pair4( 4, 1, 6), _pair4( 3, 1, 6) + _ACL4( 0, 3, 6), _ACL4( 4, 1, 6), _ACL4( 3, 1, 6) }, - .lut4 = { + .ac4 = { // 0001 xxx - _pair( 7, 1), _pair( 6, 1), _pair( 1, 2), _pair( 5, 1) + _AC2( 7, 1), _AC2( 6, 1), _AC2( 1, 2), _AC2( 5, 1) }, - .lut5 = { + .ac5 = { // 00001 xxx - _pair( 2, 2), _pair( 9, 1), _pair( 0, 4), _pair( 8, 1) + _AC2( 2, 2), _AC2( 9, 1), _AC2( 0, 4), _AC2( 8, 1) }, - .lut7 = { + .ac7 = { // 0000001 xxxx - _pair(16, 1), _pair( 5, 2), _pair( 0, 7), _pair( 2, 3), - _pair( 1, 4), _pair(15, 1), _pair(14, 1), _pair( 4, 2) + _AC2(16, 1), _AC2( 5, 2), _AC2( 0, 7), _AC2( 2, 3), + _AC2( 1, 4), _AC2(15, 1), _AC2(14, 1), _AC2( 4, 2) }, - .lut8 = { + .ac8 = { // 00000001 xxxxx - _pair( 0, 11), _pair( 8, 2), _pair( 4, 3), _pair( 0, 10), - _pair( 2, 4), _pair( 7, 2), _pair(21, 1), _pair(20, 1), - _pair( 0, 9), _pair(19, 1), _pair(18, 1), _pair( 1, 5), - _pair( 3, 3), _pair( 0, 8), _pair( 6, 2), _pair(17, 1) + _AC2( 0, 11), _AC2( 8, 2), _AC2( 4, 3), _AC2( 0, 10), + _AC2( 2, 4), _AC2( 7, 2), _AC2(21, 1), _AC2(20, 1), + _AC2( 0, 9), _AC2(19, 1), _AC2(18, 1), _AC2( 1, 5), + _AC2( 3, 3), _AC2( 0, 8), _AC2( 6, 2), _AC2(17, 1) }, - .lut9 = { + .ac9 = { // 000000001 xxxxx - _pair(10, 2), _pair( 9, 2), _pair( 5, 3), _pair( 3, 4), - _pair( 2, 5), _pair( 1, 7), _pair( 1, 6), _pair( 0, 15), - _pair( 0, 
14), _pair( 0, 13), _pair( 0, 12), _pair(26, 1), - _pair(25, 1), _pair(24, 1), _pair(23, 1), _pair(22, 1) + _AC2(10, 2), _AC2( 9, 2), _AC2( 5, 3), _AC2( 3, 4), + _AC2( 2, 5), _AC2( 1, 7), _AC2( 1, 6), _AC2( 0, 15), + _AC2( 0, 14), _AC2( 0, 13), _AC2( 0, 12), _AC2(26, 1), + _AC2(25, 1), _AC2(24, 1), _AC2(23, 1), _AC2(22, 1) }, - .lut10 = { + .ac10 = { // 0000000001 xxxxx - _pair( 0, 31), _pair( 0, 30), _pair( 0, 29), _pair( 0, 28), - _pair( 0, 27), _pair( 0, 26), _pair( 0, 25), _pair( 0, 24), - _pair( 0, 23), _pair( 0, 22), _pair( 0, 21), _pair( 0, 20), - _pair( 0, 19), _pair( 0, 18), _pair( 0, 17), _pair( 0, 16) + _AC2( 0, 31), _AC2( 0, 30), _AC2( 0, 29), _AC2( 0, 28), + _AC2( 0, 27), _AC2( 0, 26), _AC2( 0, 25), _AC2( 0, 24), + _AC2( 0, 23), _AC2( 0, 22), _AC2( 0, 21), _AC2( 0, 20), + _AC2( 0, 19), _AC2( 0, 18), _AC2( 0, 17), _AC2( 0, 16) }, - .lut11 = { + .ac11 = { // 00000000001 xxxxx - _pair( 0, 40), _pair( 0, 39), _pair( 0, 38), _pair( 0, 37), - _pair( 0, 36), _pair( 0, 35), _pair( 0, 34), _pair( 0, 33), - _pair( 0, 32), _pair( 1, 14), _pair( 1, 13), _pair( 1, 12), - _pair( 1, 11), _pair( 1, 10), _pair( 1, 9), _pair( 1, 8) + _AC2( 0, 40), _AC2( 0, 39), _AC2( 0, 38), _AC2( 0, 37), + _AC2( 0, 36), _AC2( 0, 35), _AC2( 0, 34), _AC2( 0, 33), + _AC2( 0, 32), _AC2( 1, 14), _AC2( 1, 13), _AC2( 1, 12), + _AC2( 1, 11), _AC2( 1, 10), _AC2( 1, 9), _AC2( 1, 8) }, - .lut12 = { + .ac12 = { // 000000000001 xxxxx - _pair( 1, 18), _pair( 1, 17), _pair( 1, 16), _pair( 1, 15), - _pair( 6, 3), _pair(16, 2), _pair(15, 2), _pair(14, 2), - _pair(13, 2), _pair(12, 2), _pair(11, 2), _pair(31, 1), - _pair(30, 1), _pair(29, 1), _pair(28, 1), _pair(27, 1) + _AC2( 1, 18), _AC2( 1, 17), _AC2( 1, 16), _AC2( 1, 15), + _AC2( 6, 3), _AC2(16, 2), _AC2(15, 2), _AC2(14, 2), + _AC2(13, 2), _AC2(12, 2), _AC2(11, 2), _AC2(31, 1), + _AC2(30, 1), _AC2(29, 1), _AC2(28, 1), _AC2(27, 1) + }, + .dc = { + // 00----- + _DC4(1, 0), _DC4(1, 0), _DC4(1, 0), _DC4(1, 0), + // 01----- + _DC4(2, 1), _DC4(2, 1), 
_DC4(2, 1), _DC4(2, 1), + // 100---- + _DC4(0, 2), _DC4(0, 2), + // 101---- + _DC4(3, 2), _DC4(3, 2), + // 110---- + _DC4(4, 3), _DC4(4, 3), + // 1110--- + _DC4(5, 4), + // 11110-- + _DC3(6, 5), + // 111110- + _DC2(7, 6), + // 1111110 + _DC(8, 7), + // 1111111(0) + _DC(0, 8) + }, + .dc_len = { + _DC(3, 2), _DC(2, 2), _DC(2, 2), _DC(3, 3), + _DC(3, 4), _DC(4, 5), _DC(5, 6), _DC(6, 7), + _DC(7, 8) } }; @@ -100,7 +133,7 @@ static const DECDCTTAB _default_huffman_table = { static VLC_Context _default_context; static size_t _max_buffer_size = 0; -const DECDCTTAB *_vlc_huffman_table = &_default_huffman_table; +const VLC_TableV3 *_vlc_huffman_table = &_default_huffman_table; /* Stateful VLC decoder API (for Sony SDK compatibility) */ @@ -120,10 +153,19 @@ size_t DecDCTvlcSize(size_t size) { /* Lookup table relocation API */ -void DecDCTvlcCopyTable(DECDCTTAB *addr) { +void DecDCTvlcCopyTableV2(VLC_TableV2 *addr) { + if (addr) { + _vlc_huffman_table = (const VLC_TableV3 *) addr; + memcpy(addr, &_default_huffman_table, sizeof(VLC_TableV2)); + } else { + _vlc_huffman_table = &_default_huffman_table; + } +} + +void DecDCTvlcCopyTableV3(VLC_TableV3 *addr) { if (addr) { - _vlc_huffman_table = addr; - memcpy(addr, &_default_huffman_table, sizeof(DECDCTTAB)); + _vlc_huffman_table = (const VLC_TableV3 *) addr; + memcpy(addr, &_default_huffman_table, sizeof(VLC_TableV3)); } else { _vlc_huffman_table = &_default_huffman_table; } diff --git a/libpsn00b/psxpress/vlc.s b/libpsn00b/psxpress/vlc.s index f3a1c67..2de22f7 100644 --- a/libpsn00b/psxpress/vlc.s +++ b/libpsn00b/psxpress/vlc.s @@ -1,375 +1,576 @@ # PSn00bSDK MDEC library (GTE-accelerated VLC decompressor) -# (C) 2022 spicyjpeg - MPL licensed +# (C) 2022-2023 spicyjpeg - MPL licensed # -# Register map: -# - $a0 = ctx -# - $a1 = output -# - $a2 = max_size -# - $a3 = input -# - $t0 = window -# - $t1 = next_window -# - $t2 = remaining -# - $t3 = quant_scale -# - $t4 = is_v3 -# - $t5 = bit_offset -# - $t6 = block_index -# - $t7 = 
coeff_index -# - $t8 = _vlc_huffman_table -# - $t9 = &ac_jump_area +# TODO: reduce the size of the v3 DC coefficient decoder; currently the code is +# duplicated for each block type, but it can probably be shortened with no +# performance impact... -.set noreorder +.include "gtereg.inc" -.set VLC_Context_input, 0 -.set VLC_Context_window, 4 -.set VLC_Context_next_window, 8 -.set VLC_Context_remaining, 12 -.set VLC_Context_quant_scale, 16 -.set VLC_Context_is_v3, 18 -.set VLC_Context_bit_offset, 19 -.set VLC_Context_block_index, 20 -.set VLC_Context_coeff_index, 21 - -.set DECDCTTAB_lut0, 0 -.set DECDCTTAB_lut2, 4 -.set DECDCTTAB_lut3, 36 -.set DECDCTTAB_lut4, 292 -.set DECDCTTAB_lut5, 308 -.set DECDCTTAB_lut7, 324 -.set DECDCTTAB_lut8, 356 -.set DECDCTTAB_lut9, 420 -.set DECDCTTAB_lut10, 484 -.set DECDCTTAB_lut11, 548 -.set DECDCTTAB_lut12, 612 +.set noreorder +.set noat + +.set value, $v0 +.set length, $v1 +.set ctx, $a0 +.set output, $a1 +.set max_size, $a2 +.set input, $a3 +.set temp, $t0 +.set window, $t1 +.set next_window, $t2 +.set remaining, $t3 +.set is_v3, $t4 +.set bit_offset, $t5 +.set block_index, $t6 +.set coeff_index, $t7 +.set quant_scale, $s0 +.set last_y, $s1 +.set last_cr, $s2 +.set last_cb, $s3 +.set huffman_table, $t8 +.set ac_jump_area, $t9 + +.set VLC_Context_input, 0x0 +.set VLC_Context_window, 0x4 +.set VLC_Context_next_window, 0x8 +.set VLC_Context_remaining, 0xc +.set VLC_Context_is_v3, 0x10 +.set VLC_Context_bit_offset, 0x11 +.set VLC_Context_block_index, 0x12 +.set VLC_Context_coeff_index, 0x13 +.set VLC_Context_quant_scale, 0x14 +.set VLC_Context_last_y, 0x16 +.set VLC_Context_last_cr, 0x18 +.set VLC_Context_last_cb, 0x1a + +.set VLC_Table_ac0, 0x0 +.set VLC_Table_ac2, 0x4 +.set VLC_Table_ac3, 0x24 +.set VLC_Table_ac4, 0x124 +.set VLC_Table_ac5, 0x134 +.set VLC_Table_ac7, 0x144 +.set VLC_Table_ac8, 0x164 +.set VLC_Table_ac9, 0x1a4 +.set VLC_Table_ac10, 0x1e4 +.set VLC_Table_ac11, 0x224 +.set VLC_Table_ac12, 0x264 +.set VLC_Table_dc, 
0x2a4 +.set VLC_Table_dc_len, 0x324 .section .text.DecDCTvlcStart .global DecDCTvlcStart .type DecDCTvlcStart, @function DecDCTvlcStart: + addiu $sp, -16 + sw $s0, 0($sp) + sw $s1, 4($sp) + sw $s2, 8($sp) + sw $s3, 12($sp) + # Create a new context on-the-fly without writing it to memory then jump # into DecDCTvlcContinue(), skipping context loading. - lw $t0, 8($a3) # window = (bs->data[0] << 16) | (bs->data[0] >> 16) - nop - srl $v0, $t0, 16 - sll $t0, 16 - - lw $t1, 12($a3) # next_window = (bs->data[1] << 16) | (bs->data[1] >> 16) - or $t0, $v0 - srl $v0, $t1, 16 - sll $t1, 16 - - lhu $t2, 0($a3) # remaining = bs->uncomp_length * 2 - or $t1, $v0 - - lhu $t3, 4($a3) # quant_scale = (bs->quant_scale & 63) << 10 - sll $t2, 1 - andi $t3, 63 - - lhu $t4, 6($a3) # is_v3 = !(bs->version < 3) - sll $t3, 10 - sltiu $t4, $t4, 3 - xori $t4, 1 - - li $t5, 32 # bit_offset = 32 - li $t6, 5 # block_index = 5 - li $t7, 0 # coeff_index = 0 + lw window, 8(input) # window = (bs->data[0] << 16) | (bs->data[0] >> 16) + li last_y, 0 + srl temp, window, 16 + sll window, 16 + or window, temp + + # next_window = (bs->data[1] << 16) | (bs->data[1] >> 16) + lw next_window, 12(input) + li last_cr, 0 + srl temp, next_window, 16 + sll next_window, 16 + or next_window, temp + + lhu remaining, 0(input) # remaining = bs->uncomp_length * 2 + li last_cb, 0 + sll remaining, 1 + + lw temp, 4(input) # quant_scale = (bs->quant_scale & 63) << 10 + li bit_offset, 32 + andi quant_scale, temp, 63 + sll quant_scale, 10 + + srl temp, 16 # is_v3 = !(bs->version < 3) + sltiu is_v3, temp, 3 + xori is_v3, 1 + + li block_index, 5 + li coeff_index, 0 j _vlc_skip_context_load - addiu $a3, 16 # input = &(bs->data[2]) + addiu input, 16 # input = &(bs->data[2]) .section .text.DecDCTvlcContinue .global DecDCTvlcContinue .type DecDCTvlcContinue, @function DecDCTvlcContinue: - lw $a3, VLC_Context_input($a0) - lw $t0, VLC_Context_window($a0) - lw $t1, VLC_Context_next_window($a0) - lw $t2, VLC_Context_remaining($a0) - 
lhu $t3, VLC_Context_quant_scale($a0) - lb $t4, VLC_Context_is_v3($a0) - lb $t5, VLC_Context_bit_offset($a0) - lb $t6, VLC_Context_block_index($a0) - lb $t7, VLC_Context_coeff_index($a0) + addiu $sp, -16 + sw $s0, 0($sp) + sw $s1, 4($sp) + sw $s2, 8($sp) + sw $s3, 12($sp) + + lw input, VLC_Context_input(ctx) + lw window, VLC_Context_window(ctx) + lw next_window, VLC_Context_next_window(ctx) + lw remaining, VLC_Context_remaining(ctx) + lb is_v3, VLC_Context_is_v3(ctx) + lb bit_offset, VLC_Context_bit_offset(ctx) + lb block_index, VLC_Context_block_index(ctx) + lb coeff_index, VLC_Context_coeff_index(ctx) + lhu quant_scale, VLC_Context_quant_scale(ctx) + lh last_y, VLC_Context_last_y(ctx) + lh last_cr, VLC_Context_last_cr(ctx) + lh last_cb, VLC_Context_last_cb(ctx) _vlc_skip_context_load: - # Determine how many bytes to output. This whole block of code basically - # does this: + # Determine how many bytes to output. + # if (max_size <= 0) max_size = 0x3fff0000 # max_size = min((max_size - 1) * 2, remaining) # remaining -= max_size - bgtz $a2, .Lmax_size_valid # if (max_size <= 0) max_size = 0x7ffe0000 - addiu $a2, -1 # else max_size = (max_size - 1) * 2 - lui $a2, 0x3fff + bgtz max_size, .Lmax_size_valid + addiu max_size, -1 + lui max_size, 0x3fff .Lmax_size_valid: - sll $a2, 1 + sll max_size, 1 - blt $a2, $t2, .Lmax_size_ok # if (max_size > remaining) max_size = remaining - lui $v1, 0x3800 - move $a2, $t2 -.Lmax_size_ok: - subu $t2, $a2 # remaining -= max_size + subu remaining, max_size + bgez remaining, .Lmax_size_ok + lui temp, 0x3800 + addu max_size, remaining + li remaining, 0 + +.Lmax_size_ok: # Write the length of the data that will be decoded to first 4 bytes of the # output buffer, which will be then parsed by DecDCTin(). 
- srl $v0, $a2, 1 # output[0] = 0x38000000 | (max_size / 2) - or $v0, $v1 - sw $v0, 0($a1) + srl value, max_size, 1 # output[0] = 0x38000000 | (max_size / 2) + or value, temp + sw value, 0(output) # Obtain the addresses of the lookup table and jump area in advance so that # they don't have to be retrieved for each coefficient decoded. - lw $t8, _vlc_huffman_table - la $t9, .Lac_prefix_10 + lw huffman_table, _vlc_huffman_table + la ac_jump_area, .Lac_prefix_01 - 32 - beqz $a2, .Lstop_processing - addiu $a1, 4 # output = (uint16_t *) &output[1] + beqz max_size, .Lstop_processing + addiu output, 4 .Lprocess_next_code_loop: # while (max_size) # This is the "hot" part of the decoder, executed for each code in the # bitstream. The first step is to determine if the next code is a DC or AC - # coefficient. - bnez $t7, .Lprocess_ac_coefficient - addiu $t7, 1 # coeff_index++ - bnez $t4, .Lprocess_dc_v3_coefficient - li $v1, 0x01ff + # coefficient; at the same time the GTE is given the task of counting the + # number of leading zeroes/ones in the code (which takes 2 more cycles). + mtc2 window, C2_LZCS + + bnez coeff_index, .Lprocess_ac_coefficient + addiu coeff_index, 1 + bnez is_v3, .Lprocess_dc_v3_coefficient + li temp, 0x1ff .Lprocess_dc_v2_coefficient: # if (!coeff_index && !is_v3) # The DC coefficient in version 2 frames is not compressed. Value 0x1ff is # used to signal the end of the bitstream. 
- srl $v0, $t0, 22 # prefix = (window >> (32 - 10)) - beq $v0, $v1, .Lstop_processing # if (prefix == 0x1ff) break - or $v0, $t3 # *output = prefix | quant_scale - sll $t0, 10 # window <<= 10 - b .Lwrite_value - addiu $t5, -10 # bit_offset -= 10 + # prefix = window >> (32 - 10) + # if (prefix == 0x1ff) break + # *output = prefix | quant_scale + srl value, window, 22 + beq value, temp, .Lstop_processing + or value, quant_scale + sll window, 10 + addiu bit_offset, -10 + + b .Lfeed_bitstream + sh value, 0(output) .Lprocess_dc_v3_coefficient: # if (!coeff_index && is_v3) - # TODO: version 3 is currently not supported. - jr $ra - li $v0, -1 - -.Lprocess_ac_coefficient: # if (coeff_index) - # Check whether the prefix code is one of the shorter, more common ones, - # and start counting the number of leading zeroes/ones using the GTE (which - # takes 2 more cycles). - srl $v0, $t0, 30 - li $v1, 3 - beq $v0, $v1, .Lac_prefix_11 - li $v1, 2 - beq $v0, $v1, .Lac_prefix_10 - li $v1, 1 - mtc2 $t0, $30 - beq $v0, $v1, .Lac_prefix_01 + # Version 3 DC coefficients are variable-length deltas, prefixed with a + # Huffman code indicating their length. Since the prefix code is up to 7 + # bits long, it makes sense to decode it with a simple 128-byte lookup + # table rather than using the GTE. The codes are different for luma and + # chroma blocks, so each table entry contains the decoded length for both + # block types (packed as two nibbles). Prefix 111111111 is used to signal + # the end of the bitstream. 
+ # prefix = window >> (32 - 9) + # if (prefix == 0x1ff) break + # lengths = huffman_table->dc[prefix >> 2] + srl length, window, 23 + beq length, temp, .Lstop_processing + srl length, 2 + addu length, huffman_table + + addiu $at, block_index, -4 + bltz $at, .Ldc_block_y + lbu length, VLC_Table_dc(length) + beqz $at, .Ldc_block_cb + andi length, 15 # if (block_index >= Cb) dc_length = lengths & 15 + +.Ldc_block_cr: # if (block_index > Cb) + # prefix_length = huffman_table->dc_len[dc_length] & 15 + addu temp, length, huffman_table + lbu temp, VLC_Table_dc_len(temp) + li $at, 32 + andi temp, 15 + + sllv window, window, temp + beqz length, .Ldc_cr_zero # if (dc_length) + subu bit_offset, temp + + subu $at, length # value = window >> (32 - dc_length) + srlv value, window, $at + + # Decode the sign bit, then add the decoded delta to the current value. + # if (!(window >> 31)) value -= (1 << dc_length) - 1 + bltz window, .Ldc_cr_positive + li temp, -1 + srlv temp, temp, $at + subu value, temp +.Ldc_cr_positive: + addu last_cr, value + andi last_cr, 0x3ff + +.Ldc_cr_zero: + sll temp, last_cr, 2 # *output = (last_cr << 2) | quant_scale + or temp, quant_scale + b .Lupdate_window_dc # update_window(dc_length) + sh temp, 0(output) + +.Ldc_block_cb: # if (block_index == Cb) + # prefix_length = huffman_table->dc_len[dc_length] & 15 + addu temp, length, huffman_table + lbu temp, VLC_Table_dc_len(temp) + li $at, 32 + andi temp, 15 + + sllv window, window, temp + beqz length, .Ldc_cb_zero # if (dc_length) + subu bit_offset, temp + + subu $at, length # value = window >> (32 - dc_length) + srlv value, window, $at + + # Decode the sign bit, then add the decoded delta to the current value. 
+ # if (!(window >> 31)) value -= (1 << dc_length) - 1 + bltz window, .Ldc_cb_positive + li temp, -1 + srlv temp, temp, $at + subu value, temp +.Ldc_cb_positive: + addu last_cb, value + andi last_cb, 0x3ff + +.Ldc_cb_zero: + sll value, last_cb, 2 # *output = (last_cb << 2) | quant_scale + or value, quant_scale + b .Lupdate_window_dc # update_window(dc_length) + sh value, 0(output) + +.Ldc_block_y: # if (block_index < Cb) nop + srl length, 4 # dc_length = lengths >> 4 + + # prefix_length = huffman_table->dc_len[dc_length] >> 4 + addu temp, length, huffman_table + lbu temp, VLC_Table_dc_len(temp) + li $at, 32 + srl temp, 4 + + sllv window, window, temp + beqz length, .Ldc_y_zero # if (dc_length) + subu bit_offset, temp + + sll temp, last_y, 2 + subu $at, length # value = window >> (32 - dc_length) + srlv value, window, $at + + # Decode the sign bit, then add the decoded delta to the current value. + # if (!(window >> 31)) value -= (1 << dc_length) - 1 + bltz window, .Ldc_y_positive + li temp, -1 + srlv temp, temp, $at + subu value, temp +.Ldc_y_positive: + addu last_y, value + andi last_y, 0x3ff + +.Ldc_y_zero: + sll temp, last_y, 2 # *output = (last_y << 2) | quant_scale + or temp, quant_scale + b .Lupdate_window_dc # update_window(dc_length) + sh temp, 0(output) - # If the code is longer, retrieve the number of leading zeroes from the GTE - # and use it as an index into the jump area. Each block in the area is 8 - # instructions long and handles decoding a specific prefix. - mfc2 $v0, $31 - li $v1, 11 - bgt $v0, $v1, .Lreturn_error # if (prefix > 11) return -1 - sll $v0, 5 # jump_addr = &ac_jump_area[prefix * 8 * sizeof(u32)] - addu $v0, $t9 - jr $v0 +.Lprocess_ac_coefficient: # if (coeff_index) + # Check whether the prefix code is 10 or 11 (i.e. if it starts with 1). If + # not, retrieve the number of leading zeroes from the GTE and use it as an + # index into the jump area. Each block in the area is 8 instructions long + # and handles decoding a specific prefix. 
+ mfc2 temp, C2_LZCR + + bltz window, .Lac_prefix_1 # if (!(window >> 31)) + addiu $at, temp, -11 # if (prefix > 11) return -1 + bgtz $at, .Lreturn_error + sll temp, 5 # jump_addr = &ac_jump_area[prefix * 8 * sizeof(uint32_t)] + addu temp, ac_jump_area + jr temp nop .Lreturn_error: - jr $ra + b .Lreturn li $v0, -1 -.Lac_prefix_11: - # Prefix 11 is followed by a single bit. - srl $v0, $t0, 28 # index = ((window >> (32 - 2 - 1)) & 1) * sizeof(u16) - andi $v0, 2 - addu $v0, $t8 # value = table->lut0[index] - lhu $v0, DECDCTTAB_lut0($v0) - sll $t0, 3 # window <<= 3 - b .Lwrite_value - addiu $t5, -3 # bit_offset -= 3 - #.word 0 +.Lac_prefix_1: # if (window >> 31) + sll window, 1 + bltz window, .Lac_prefix_11 + li temp, 0xfe00 .Lac_prefix_10: # Prefix 10 marks the end of a block. - li $v0, 0xfe00 # value = 0xfe00 - sll $t0, 2 # window <<= 2 - addiu $t5, -2 # bit_offset -= 2 - addiu $t6, -1 # block_index-- - bgez $t6, .Lwrite_value - li $t7, 0 # coeff_index = 0 - b .Lwrite_value - li $t6, 5 # if (block_index < 0) block_index = 5 + # *output = 0xfe00 + # coeff_index = 0 + # if (--block_index < Y3) block_index = Cr + sll window, 1 + addiu bit_offset, -2 + sh temp, 0(output) + + addiu block_index, -1 + bgez block_index, .Lfeed_bitstream + li coeff_index, 0 + b .Lfeed_bitstream + li block_index, 5 + +.Lac_prefix_11: + # Prefix 11 is followed by a single bit. Note that the 10/11 prefix check + # already shifts the window by one bit (without updating the bit offset). + # index = ((window >> (32 - 1 - 1)) & 1) * sizeof(uint16_t) + # *output = huffman_table->ac0[index] + srl value, window, 29 + andi value, 2 + addu value, huffman_table + lhu value, VLC_Table_ac0(value) + sll window, 2 + addiu bit_offset, -3 + + b .Lfeed_bitstream + sh value, 0(output) .Lac_prefix_01: # Prefix 01 can be followed by a 2-bit lookup index starting with 1, or a # 3-bit lookup index starting with 0. A 32-bit lookup table is used, # containing both MDEC codes and lengths. 
- srl $v0, $t0, 25 # index = ((window >> (32 - 2 - 3)) & 7) * sizeof(u32) - andi $v0, 28 - addu $v0, $t8 # value = table->lut2[index] - lw $v0, DECDCTTAB_lut2($v0) - b .Lupdate_window_and_write - srl $v1, $v0, 16 # length = value >> 16 + # index = ((window >> (32 - 2 - 3)) & 7) * sizeof(uint32_t) + # *output = huffman_table->ac2[index] & 0xffff + # length = huffman_table->ac2[index] >> 16 + srl value, window, 25 + andi value, 28 + addu value, huffman_table + lw value, VLC_Table_ac2(value) + + b .Lupdate_window_ac # update_window(value >> 16) + sh value, 0(output) .word 0, 0 .Lac_prefix_001: # Prefix 001 can be followed by a 6-bit lookup index starting with 00, or a # 3-bit lookup index starting with 01/10/11. - srl $v0, $t0, 21 # index = ((window >> (32 - 3 - 6)) & 63) * sizeof(u32) - andi $v0, 252 - addu $v0, $t8 # value = table->lut3[index] - lw $v0, DECDCTTAB_lut3($v0) - b .Lupdate_window_and_write - srl $v1, $v0, 16 # length = value >> 16 + # index = ((window >> (32 - 3 - 6)) & 63) * sizeof(uint32_t) + # *output = huffman_table->ac3[index] & 0xffff + # length = huffman_table->ac3[index] >> 16 + srl value, window, 21 + andi value, 252 + addu value, huffman_table + lw value, VLC_Table_ac3(value) + + b .Lupdate_window_ac # update_window(value >> 16) + sh value, 0(output) .word 0, 0 .Lac_prefix_0001: # Prefix 0001 is followed by a 3-bit lookup index. - srl $v0, $t0, 24 # index = ((window >> (32 - 4 - 3)) & 7) * sizeof(u16) - andi $v0, 14 - addu $v0, $t8 # value = table->lut4[index] - lhu $v0, DECDCTTAB_lut4($v0) - sll $t0, 7 # window <<= 4 + 3 - b .Lwrite_value - addiu $t5, -7 # bit_offset -= 4 + 3 - .word 0 + # index = ((window >> (32 - 4 - 3)) & 7) * sizeof(uint16_t) + # *output = huffman_table->ac4[index] + srl value, window, 24 + andi value, 14 + addu value, huffman_table + lhu value, VLC_Table_ac4(value) + sll window, 7 + addiu bit_offset, -7 + + b .Lfeed_bitstream + sh value, 0(output) .Lac_prefix_00001: # Prefix 00001 is followed by a 3-bit lookup index. 
- srl $v0, $t0, 23 # index = ((window >> (32 - 5 - 3)) & 7) * sizeof(u16) - andi $v0, 14 - addu $v0, $t8 # value = table->lut5[index] - lhu $v0, DECDCTTAB_lut5($v0) - sll $t0, 8 # window <<= 5 + 3 - b .Lwrite_value - addiu $t5, -8 # bit_offset -= 5 + 3 - .word 0 + # index = ((window >> (32 - 5 - 3)) & 7) * sizeof(uint16_t) + # *output = huffman_table->ac5[index] + srl value, window, 23 + andi value, 14 + addu value, huffman_table + lhu value, VLC_Table_ac5(value) + sll window, 8 + addiu bit_offset, -8 + + b .Lfeed_bitstream + sh value, 0(output) .Lac_prefix_000001: # Prefix 000001 is an escape code followed by a full 16-bit MDEC value. - srl $v0, $t0, 10 # value = window >> (32 - 6 - 16) - sll $t0, 22 # window <<= 6 + 16 - b .Lwrite_value - addiu $t5, -22 # bit_offset -= 6 + 16 - .word 0, 0, 0, 0 + # *output = window >> (32 - 6 - 16) + srl value, window, 10 + sll window, 22 + addiu bit_offset, -22 + + b .Lfeed_bitstream + sh value, 0(output) + .word 0, 0, 0 .Lac_prefix_0000001: # Prefix 0000001 is followed by a 4-bit lookup index. - srl $v0, $t0, 20 # index = ((window >> (32 - 7 - 4)) & 15) * sizeof(u16) - andi $v0, 30 - addu $v0, $t8 # value = table->lut7[index] - lhu $v0, DECDCTTAB_lut7($v0) - sll $t0, 11 # window <<= 7 + 4 - b .Lwrite_value - addiu $t5, -11 # bit_offset -= 7 + 4 - .word 0 + # index = ((window >> (32 - 7 - 4)) & 15) * sizeof(uint16_t) + # *output = huffman_table->ac7[index] + srl value, window, 20 + andi value, 30 + addu value, huffman_table + lhu value, VLC_Table_ac7(value) + sll window, 11 + addiu bit_offset, -11 + + b .Lfeed_bitstream + sh value, 0(output) .Lac_prefix_00000001: # Prefix 00000001 is followed by a 5-bit lookup index. 
- srl $v0, $t0, 18 # index = ((window >> (32 - 8 - 5)) & 31) * sizeof(u16) - andi $v0, 62 - addu $v0, $t8 # value = table->lut8[index] - lhu $v0, DECDCTTAB_lut8($v0) - sll $t0, 13 # window <<= 8 + 5 - b .Lwrite_value - addiu $t5, -13 # bit_offset -= 8 + 5 - .word 0 + # index = ((window >> (32 - 8 - 5)) & 31) * sizeof(uint16_t) + # *output = huffman_table->ac8[index] + srl value, window, 18 + andi value, 62 + addu value, huffman_table + lhu value, VLC_Table_ac8(value) + sll window, 13 + addiu bit_offset, -13 + + b .Lfeed_bitstream + sh value, 0(output) .Lac_prefix_000000001: # Prefix 000000001 is followed by a 5-bit lookup index. - srl $v0, $t0, 17 # index = ((window >> (32 - 9 - 5)) & 31) * sizeof(u16) - andi $v0, 62 - addu $v0, $t8 # value = table->lut9[index] - lhu $v0, DECDCTTAB_lut9($v0) - sll $t0, 14 # window <<= 9 + 5 - b .Lwrite_value - addiu $t5, -14 # bit_offset -= 9 + 5 - .word 0 + # index = ((window >> (32 - 9 - 5)) & 31) * sizeof(uint16_t) + # *output = huffman_table->ac9[index] + srl value, window, 17 + andi value, 62 + addu value, huffman_table + lhu value, VLC_Table_ac9(value) + sll window, 14 + addiu bit_offset, -14 + + b .Lfeed_bitstream + sh value, 0(output) .Lac_prefix_0000000001: # Prefix 0000000001 is followed by a 5-bit lookup index. - srl $v0, $t0, 16 # index = ((window >> (32 - 10 - 5)) & 31) * sizeof(u16) - andi $v0, 62 - addu $v0, $t8 # value = table->lut10[index] - lhu $v0, DECDCTTAB_lut10($v0) - sll $t0, 15 # window <<= 10 + 5 - b .Lwrite_value - addiu $t5, -15 # bit_offset -= 10 + 5 - .word 0 + # index = ((window >> (32 - 10 - 5)) & 31) * sizeof(uint16_t) + # *output = huffman_table->ac10[index] + srl value, window, 16 + andi value, 62 + addu value, huffman_table + lhu value, VLC_Table_ac10(value) + sll window, 15 + addiu bit_offset, -15 + + b .Lfeed_bitstream + sh value, 0(output) .Lac_prefix_00000000001: # Prefix 00000000001 is followed by a 5-bit lookup index. 
- srl $v0, $t0, 15 # index = ((window >> (32 - 11 - 5)) & 31) * sizeof(u16) - andi $v0, 62 - addu $v0, $t8 # value = table->lut11[index] - lhu $v0, DECDCTTAB_lut11($v0) - sll $t0, 16 # window <<= 11 + 5 - b .Lwrite_value - addiu $t5, -16 # bit_offset -= 11 + 5 - .word 0 + # index = ((window >> (32 - 11 - 5)) & 31) * sizeof(uint16_t) + # *output = huffman_table->ac11[index] + srl value, window, 15 + andi value, 62 + addu value, huffman_table + lhu value, VLC_Table_ac11(value) + sll window, 16 + addiu bit_offset, -16 + + b .Lfeed_bitstream + sh value, 0(output) .Lac_prefix_000000000001: # Prefix 000000000001 is followed by a 5-bit lookup index. - srl $v0, $t0, 14 # index = ((window >> (32 - 12 - 5)) & 31) * sizeof(u16) - andi $v0, 62 - addu $v0, $t8 # value = table->lut12[index] - lhu $v0, DECDCTTAB_lut12($v0) - sll $t0, 17 # window <<= 12 + 5 - b .Lwrite_value - addiu $t5, -17 # bit_offset -= 12 + 5 - .word 0 - -.Lupdate_window_and_write: - sllv $t0, $t0, $v1 # window <<= length - subu $t5, $v1 # bit_offset -= length -.Lwrite_value: - sh $v0, 0($a1) + # index = ((window >> (32 - 12 - 5)) & 31) * sizeof(uint16_t) + # *output = huffman_table->ac12[index] + srl value, window, 14 + andi value, 62 + addu value, huffman_table + lhu value, VLC_Table_ac12(value) + sll window, 17 + addiu bit_offset, -17 + + b .Lfeed_bitstream + sh value, 0(output) + +.Lupdate_window_ac: + srl length, value, 16 +.Lupdate_window_dc: + sllv window, window, length + subu bit_offset, length + .Lfeed_bitstream: # Update the window. This makes sure the next iteration of the loop will be # able to read up to 32 bits from the bitstream. 
- bgez $t5, .Lskip_feeding # if (bit_offset < 0) - addiu $a2, -1 # max_size-- - - subu $v0, $0, $t5 # window = next_window << (-bit_offset) - sllv $t0, $t1, $v0 - lw $t1, 0($a3) # next_window = (*input << 16) | (*input >> 16) - addiu $t5, 32 # bit_offset += 32 - srl $v0, $t1, 16 - sll $t1, 16 - or $t1, $v0 - addiu $a3, 4 # input++ + bgez bit_offset, .Lskip_feeding # if (bit_offset < 0) + addiu max_size, -1 + + subu temp, $0, bit_offset # window = next_window << (-bit_offset) + sllv window, next_window, temp + lw next_window, 0(input) # next_window = (*input << 16) | (*input >> 16) + addiu bit_offset, 32 + srl temp, next_window, 16 + sll next_window, 16 + or next_window, temp + addiu input, 4 .Lskip_feeding: - srlv $v0, $t1, $t5 # window |= next_window >> bit_offset - or $t0, $v0 + srlv temp, next_window, bit_offset # window |= next_window >> bit_offset + or window, temp - bnez $a2, .Lprocess_next_code_loop - addiu $a1, 2 # output++ + bnez max_size, .Lprocess_next_code_loop + addiu output, 2 .Lstop_processing: # If remaining = 0, skip flushing the context, pad the output buffer with # end-of-block codes if necessary and return 0. Otherwise flush the context # and return 1. 
- beqz $t2, .Lpad_output_buffer - nop - - sw $a3, VLC_Context_input($a0) - sw $t0, VLC_Context_window($a0) - sw $t1, VLC_Context_next_window($a0) - sw $t2, VLC_Context_remaining($a0) - sh $t3, VLC_Context_quant_scale($a0) - sb $t4, VLC_Context_is_v3($a0) - sb $t5, VLC_Context_bit_offset($a0) - sb $t6, VLC_Context_block_index($a0) - sb $t7, VLC_Context_coeff_index($a0) - - jr $ra + beqz remaining, .Lpad_output_buffer + li temp, 0xfe00 + + sw input, VLC_Context_input(ctx) + sw window, VLC_Context_window(ctx) + sw next_window, VLC_Context_next_window(ctx) + sw remaining, VLC_Context_remaining(ctx) + sb bit_offset, VLC_Context_bit_offset(ctx) + sb block_index, VLC_Context_block_index(ctx) + sb coeff_index, VLC_Context_coeff_index(ctx) + sh last_y, VLC_Context_last_y(ctx) + sh last_cr, VLC_Context_last_cr(ctx) + sh last_cb, VLC_Context_last_cb(ctx) + + b .Lreturn li $v0, 1 .Lpad_output_buffer: - beqz $a2, .Lreturn_zero - li $v0, 0xfe00 -.Lpad_output_buffer_loop: # while (max_size) - sh $v0, 0($a1) # *output = 0xfe00 - addiu $a2, -1 # max_size-- - bnez $a2, .Lpad_output_buffer_loop - addiu $a1, 2 # output++ + beqz max_size, .Lreturn + li $v0, 0 -.Lreturn_zero: +.Lpad_output_buffer_loop: # while (max_size) + sh temp, 0(output) + addiu max_size, -1 + bnez max_size, .Lpad_output_buffer_loop + addiu output, 2 + +.Lreturn: + lw $s0, 0($sp) + lw $s1, 4($sp) + lw $s2, 8($sp) + lw $s3, 12($sp) jr $ra - li $v0, 0 + addiu $sp, 16 diff --git a/libpsn00b/psxpress/vlc2.c b/libpsn00b/psxpress/vlc2.c index 9eb99bf..24c54ce 100644 --- a/libpsn00b/psxpress/vlc2.c +++ b/libpsn00b/psxpress/vlc2.c @@ -63,7 +63,7 @@ static const uint32_t _compressed_table[TABLE_LENGTH] = { static VLC_Context _default_context; static size_t _max_buffer_size = 0; -const DECDCTTAB2 *_vlc_huffman_table2 = 0; +const DECDCTTAB *_vlc_huffman_table2 = 0; /* VLC decoder */ @@ -77,14 +77,17 @@ int __attribute__((optimize(3))) DecDCTvlcContinue2( VLC_Context *ctx, uint32_t *buf, size_t max_size ) { const uint32_t 
*input = ctx->input; - uint32_t remaining = ctx->remaining; uint32_t window = ctx->window; uint32_t next_window = ctx->next_window; - uint16_t quant_scale = ctx->quant_scale; + uint32_t remaining = ctx->remaining; + int is_v3 = ctx->is_v3; + int bit_offset = ctx->bit_offset; int block_index = ctx->block_index; int coeff_index = ctx->coeff_index; - int bit_offset = ctx->bit_offset; - int is_v3 = ctx->is_v3; + uint16_t quant_scale = ctx->quant_scale; + int16_t last_y = ctx->last_y; + int16_t last_cr = ctx->last_cr; + int16_t last_cb = ctx->last_cb; //if (!_vlc_huffman_table2) //return -1; @@ -122,13 +125,13 @@ int __attribute__((optimize(3))) DecDCTvlcContinue2( } else if (window >> 24) { // The first lookup table is for codes that not start with // 00000000. - value = _vlc_huffman_table2->lut[_get_bits_unsigned(13)]; + value = _vlc_huffman_table2->ac[_get_bits_unsigned(13)]; _advance_window(value >> 16); *output = (uint16_t) value; } else { // If the code starts with 00000000, use the second lookup // table. 
- value = _vlc_huffman_table2->lut00[_get_bits_unsigned(17)]; + value = _vlc_huffman_table2->ac00[_get_bits_unsigned(17)]; _advance_window(value >> 16); *output = (uint16_t) value; } @@ -176,12 +179,15 @@ int __attribute__((optimize(3))) DecDCTvlcContinue2( return 0; ctx->input = input; - ctx->remaining = remaining; ctx->window = window; ctx->next_window = next_window; + ctx->remaining = remaining; + ctx->bit_offset = bit_offset; ctx->block_index = block_index; ctx->coeff_index = coeff_index; - ctx->bit_offset = bit_offset; + ctx->last_y = last_y; + ctx->last_cr = last_cr; + ctx->last_cb = last_cb; return 1; } @@ -197,21 +203,24 @@ int DecDCTvlcStart2( return -1; ctx->input = &input[2]; - ctx->remaining = (header->mdec0_header & 0xffff) * 2; ctx->window = (input[0] << 16) | (input[0] >> 16); ctx->next_window = (input[1] << 16) | (input[1] >> 16); - ctx->quant_scale = (header->quant_scale & 63) << 10; + ctx->remaining = (header->mdec0_header & 0xffff) * 2; + ctx->is_v3 = (header->version >= 3); + ctx->bit_offset = 32; ctx->block_index = 0; ctx->coeff_index = 0; - ctx->bit_offset = 32; - ctx->is_v3 = (header->version == 3); + ctx->quant_scale = (header->quant_scale & 63) << 10; + ctx->last_y = 0; + ctx->last_cr = 0; + ctx->last_cb = 0; return DecDCTvlcContinue2(ctx, buf, max_size); } /* Stateful VLC decoder API (for Sony SDK compatibility) */ -int DecDCTvlc2(const uint32_t *bs, uint32_t *buf, DECDCTTAB2 *table) { +int DecDCTvlc2(const uint32_t *bs, uint32_t *buf, DECDCTTAB *table) { if (table) _vlc_huffman_table2 = table; @@ -230,7 +239,7 @@ size_t DecDCTvlcSize2(size_t size) { /* Lookup table decompressor */ -void DecDCTvlcBuild(DECDCTTAB2 *table) { +void DecDCTvlcBuild(DECDCTTAB *table) { uint32_t *output = (uint32_t *) table; _vlc_huffman_table2 = table; diff --git a/libpsn00b/psxspu/common.c b/libpsn00b/psxspu/common.c index 6ccbef4..1275621 100644 --- a/libpsn00b/psxspu/common.c +++ b/libpsn00b/psxspu/common.c @@ -1,6 +1,6 @@ /* * PSn00bSDK SPU library (common 
functions) - * (C) 2022 spicyjpeg - MPL licensed + * (C) 2022-2023 spicyjpeg - MPL licensed */ #include <stdint.h> @@ -32,7 +32,7 @@ static void _wait_status(uint16_t mask, uint16_t value) { return; } - _sdk_log("status register timeout (0x%04x)\n", SPU_STAT); + _sdk_log("timeout, status=0x%04x\n", SPU_STAT); } static size_t _dma_transfer(uint32_t *data, size_t length, int write) { |
