From a21e949c9aea98cb4b3feee48bb98579bbdfba70 Mon Sep 17 00:00:00 2001 From: spicyjpeg Date: Sun, 22 Jan 2023 15:40:14 +0100 Subject: Fix VSync(), assert(), warnings and some examples --- examples/demos/n00bdemo/main.c | 14 +++----------- examples/graphics/billboard/billboard.c | 2 +- examples/io/system573/iso.xml | 8 +++++--- examples/system/childexec/child/child.c | 7 ++++--- examples/system/childexec/parent.c | 22 +++++++++------------- 5 files changed, 22 insertions(+), 31 deletions(-) (limited to 'examples') diff --git a/examples/demos/n00bdemo/main.c b/examples/demos/n00bdemo/main.c index 6d0be3c..c9ca3ce 100644 --- a/examples/demos/n00bdemo/main.c +++ b/examples/demos/n00bdemo/main.c @@ -624,7 +624,7 @@ void plasmastuff() { // Simple stripe transition effect void transition() { - int i,count,comp; + int count = 0; int bheight[16] = { 0 }; TILE *tile = (TILE*)nextpri; @@ -632,9 +632,9 @@ void transition() { while( 1 ) { - comp = 0; + int comp = 0; - for( i=0; i<16; i++ ) { + for( int i=0; i<16; i++ ) { if( bheight[i] > 0 ) { @@ -657,19 +657,11 @@ void transition() { if( bheight[count>>1] == 0 ) bheight[count>>1] = 1; - display(); count++; if( comp >= 16 ) break; - - /* - I haven't yet managed to figure out why this loop hangs on no$psx - if I comment out this completely useless call to puts(). Some - alignment or timing crap perhaps? 
-- spicyjpeg - */ - puts("."); } DrawSync(0); diff --git a/examples/graphics/billboard/billboard.c b/examples/graphics/billboard/billboard.c index ea98b28..1ddc4dc 100644 --- a/examples/graphics/billboard/billboard.c +++ b/examples/graphics/billboard/billboard.c @@ -152,7 +152,7 @@ int main() { setRGB0(quad, 128, 128, 128); // Set tpage - quad->tpage = getTPage(tim.mode&0x8, 0, tim.prect->x, tim.prect->y); + quad->tpage = getTPage(tim.mode, 0, tim.prect->x, tim.prect->y); // Set CLUT setClut(quad, tim.crect->x, tim.crect->y); diff --git a/examples/io/system573/iso.xml b/examples/io/system573/iso.xml index 2226089..c960b8a 100644 --- a/examples/io/system573/iso.xml +++ b/examples/io/system573/iso.xml @@ -19,9 +19,9 @@ The System 573 BIOS does not parse SYSTEM.CNF, it's instead hardcoded to look for an executable named PSX.EXE. Some modded or hacked BIOS variants may instead look for slightly altered - file names (QSY.DXD, SSW.BXF, TSV.AXG) as an obfuscation - measure, so it's recommended to have multiple copies of the - executable on the disc. + file names (QSY.DXD, SSW.BXF, TSV.AXG, GSE.NXX, NSE.GXX) as an + obfuscation measure, so it's recommended to have multiple + copies of the executable on the disc. Note that this behavior can be abused to make multi-system CDs with different executables for PS1 and 573 (i.e. 
have both @@ -33,6 +33,8 @@ + + diff --git a/examples/system/childexec/child/child.c b/examples/system/childexec/child/child.c index dcfbfaf..b52dd32 100644 --- a/examples/system/childexec/child/child.c +++ b/examples/system/childexec/child/child.c @@ -238,11 +238,12 @@ int main(int argc, const char *argv[]) { display(); } - + + DrawSync(0); StopPAD(); - + StopCallback(); + return 0; - } void init(void) { diff --git a/examples/system/childexec/parent.c b/examples/system/childexec/parent.c index 83d964c..79c81f1 100644 --- a/examples/system/childexec/parent.c +++ b/examples/system/childexec/parent.c @@ -285,31 +285,27 @@ void run_child(void) { // Copy child executable to its intended adddress memcpy((void*)exe->param.t_addr, child_exe+2048, exe->param.t_size); - - // Prepare for program execution and disable interrupts - //EnterCriticalSection(); - StopCallback(); - // Stop pads, enable auto acknowledge + // Prepare for program execution and disable interrupts + DrawSync(0); StopPAD(); - ChangeClearPAD(1); - ChangeClearRCnt(3, 1); + StopCallback(); + FlushCache(); // Execute child - printf("Child exec!\n"); + printf("Executing child...\n"); Exec(&exe->param, 3, args); - + // Restore interrupts for this PS-EXE RestartCallback(); - //ExitCriticalSection(); - + printf("Child returned\n"); + // Re-init and re-enable pads InitPAD(pad_buff[0], 34, pad_buff[1], 34); StartPAD(); ChangeClearPAD(0); - + // Set this program's display mode SetDispMask(0); PutDispEnv(&disp); - } -- cgit v1.2.3 From 09f321e37fc187affa664d32e36e32c0533a7e8e Mon Sep 17 00:00:00 2001 From: spicyjpeg Date: Mon, 23 Jan 2023 09:36:22 +0100 Subject: Add BS v3 decoding, fix MDEC API and strvideo example --- examples/mdec/strvideo/main.c | 74 +++-- libpsn00b/include/psxpress.h | 126 ++++--- libpsn00b/psxpress/README.md | 29 +- libpsn00b/psxpress/mdec.c | 37 +-- libpsn00b/psxpress/vlc.c | 160 +++++---- libpsn00b/psxpress/vlc.s | 743 +++++++++++++++++++++++++++--------------- libpsn00b/psxpress/vlc2.c | 39 
++- libpsn00b/psxspu/common.c | 4 +- 8 files changed, 763 insertions(+), 449 deletions(-) (limited to 'examples') diff --git a/examples/mdec/strvideo/main.c b/examples/mdec/strvideo/main.c index 28d39b2..57cb6ef 100644 --- a/examples/mdec/strvideo/main.c +++ b/examples/mdec/strvideo/main.c @@ -1,6 +1,6 @@ /* * PSn00bSDK .STR FMV playback example - * (C) 2022 spicyjpeg - MPL licensed + * (C) 2022-2023 spicyjpeg - MPL licensed * * This example demonstrates playback of full-motion video in the standard .STR * format, using the MDEC for frame decoding and XA for audio. Decoded frames @@ -34,9 +34,10 @@ * Playback is stopped once the .STR header is no longer present in sectors * read. * - * Note that PSn00bSDK's bitstream decoding API only supports version 1 and 2 - * bitstreams currently, so make sure your .STR files are encoded as v2 and not - * v3. + * PSn00bSDK's bitstream decoding API supports both version 2 and 3 bitstreams. + * Encoding your .STR files as v3 may result in slightly higher quality + * depending on the encoder, but also higher CPU usage during playback compared + * to the older v2. */ #include @@ -102,13 +103,12 @@ void init_context(RenderContext *ctx) { FntOpen(4, 12, 312, 16, 2, 256); } -void display(RenderContext *ctx, int sync) { +void display(RenderContext *ctx) { Framebuffer *db; ctx->db_active ^= 1; DrawSync(0); - if (sync) - VSync(0); + //VSync(0); db = &(ctx->db[ctx->db_active]); PutDrawEnv(&(db->draw)); @@ -163,13 +163,13 @@ typedef struct { volatile int8_t cur_frame, cur_slice; } StreamContext; -StreamContext str_ctx; +static StreamContext str_ctx; // This buffer is used by cd_sector_handler() as a temporary area for sectors // read from the CD. Due to DMA limitations it can't be allocated on the stack // (especially not in the interrupt callbacks' stack, whose size is very // limited). 
-STR_Header sector_header; +static STR_Header sector_header; void cd_sector_handler(void) { StreamBuffer *frame = &str_ctx.frames[str_ctx.cur_frame]; @@ -268,7 +268,7 @@ void init_stream(void) { // optional but makes the decompressor slightly faster. See the libpsxpress // documentation for more details. DecDCTvlcSize(0x8000); - DecDCTvlcCopyTable((DECDCTTAB *) 0x1f800000); + DecDCTvlcCopyTableV3((VLC_TableV3 *) 0x1f800000); str_ctx.cur_frame = 0; str_ctx.cur_slice = 0; @@ -309,7 +309,7 @@ void start_stream(CdlFILE *file) { static RenderContext ctx; -#define SHOW_STATUS(...) { FntPrint(-1, __VA_ARGS__); FntFlush(-1); display(&ctx, 1); } +#define SHOW_STATUS(...) { FntPrint(-1, __VA_ARGS__); FntFlush(-1); display(&ctx); } #define SHOW_ERROR(...) { SHOW_STATUS(__VA_ARGS__); while (1) __asm__("nop"); } int main(int argc, const char* argv[]) { @@ -318,7 +318,7 @@ int main(int argc, const char* argv[]) { SHOW_STATUS("INITIALIZING\n"); SpuInit(); CdInit(); - InitGeom(); // Required for PSn00bSDK's DecDCTvlc() + InitGeom(); // GTE initialization required by the VLC decompressor DecDCTReset(0); SHOW_STATUS("OPENING VIDEO FILE\n"); @@ -330,8 +330,9 @@ int main(int argc, const char* argv[]) { init_stream(); start_stream(&file); - // Disable framebuffer clearing to get rid of flickering during playback. - display(&ctx, 1); + // Clear the screen, then disable framebuffer clearing to get rid of + // flickering during playback. + display(&ctx); ctx.db[0].draw.isbg = 0; ctx.db[1].draw.isbg = 0; #ifdef DISP_24BPP @@ -339,9 +340,13 @@ int main(int argc, const char* argv[]) { ctx.db[1].disp.isrgb24 = 1; #endif - int decode_errors = 0; + int frame_time = 1, decode_errors = 0; while (1) { +#ifdef DRAW_OVERLAY + int frame_start = TIMER_VALUE(1); +#endif + // Wait for a full frame to be read from the disc and decompress the // bitstream into the format expected by the MDEC. If the video has // ended, restart playback from the beginning. 
@@ -355,38 +360,45 @@ int main(int argc, const char* argv[]) { } #ifdef DRAW_OVERLAY - // Measure CPU usage of the decompressor using the hblank counter. - int total_time = TIMER_VALUE(1) + 1; - TIMER_VALUE(1) = 0; + int decode_time = TIMER_VALUE(1); #endif - if (DecDCTvlc(frame->bs_data, frame->mdec_data)) { + VLC_Context vlc_ctx; + if (DecDCTvlcStart( + &vlc_ctx, + frame->mdec_data, + sizeof(frame->mdec_data) / 4, + frame->bs_data + )) { decode_errors++; continue; } #ifdef DRAW_OVERLAY - int cpu_usage = TIMER_VALUE(1) * 100 / total_time; + // Calculate CPU usage of the decompressor. + decode_time = (TIMER_VALUE(1) - decode_time) & 0xffff; + int cpu_usage = decode_time * 100 / frame_time; #endif // Wait for the MDEC to finish decoding the previous frame, then flip // the framebuffers to display it and prepare the buffer for the next // frame. - // NOTE: you should *not* call VSync(0) during playback, as the refresh - // rate of the GPU is not synced to the video's frame rate. If you want - // to minimize screen tearing, consider triple buffering instead (i.e. - // always keep 2 fully decoded frames in VRAM and use VSyncCallback() - // to register a function that displays the next decoded frame whenever - // vblank occurs). + // NOTE: as the refresh rate of the GPU is not synced to the video's + // frame rate, this VSync(0) call may potentially end up waiting too + // long and desynchronizing playback. A better solution would be to + // implement triple buffering (i.e. always keep 2 fully decoded frames + // in VRAM and use VSyncCallback() to register a function that displays + // the next decoded frame if available whenever vblank occurs). 
+ VSync(0); DecDCTinSync(0); DecDCToutSync(0); #ifdef DRAW_OVERLAY - FntPrint(-1, "FRAME:%5d READ ERRORS: %5d\n", str_ctx.frame_id, str_ctx.dropped_frames); - FntPrint(-1, "CPU: %5d%% DECODE ERRORS:%5d\n", cpu_usage, decode_errors); + FntPrint(-1, "FRAME:%6d READ ERRORS: %6d\n", str_ctx.frame_id, str_ctx.dropped_frames); + FntPrint(-1, "CPU: %6d%% DECODE ERRORS:%6d\n", cpu_usage, decode_errors); FntFlush(-1); #endif - display(&ctx, 0); + display(&ctx); // Feed the newly decompressed frame to the MDEC. The MDEC will not // actually start decoding it until an output buffer is also configured @@ -414,6 +426,10 @@ int main(int argc, const char* argv[]) { str_ctx.slices[str_ctx.cur_slice], BLOCK_SIZE * str_ctx.slice_pos.h / 2 ); + +#ifdef DRAW_OVERLAY + frame_time = (TIMER_VALUE(1) - frame_start) & 0xffff; +#endif } return 0; diff --git a/libpsn00b/include/psxpress.h b/libpsn00b/include/psxpress.h index dc1d52c..c3b13f4 100644 --- a/libpsn00b/include/psxpress.h +++ b/libpsn00b/include/psxpress.h @@ -1,6 +1,6 @@ /* * PSn00bSDK MDEC library - * (C) 2022 spicyjpeg - MPL licensed + * (C) 2022-2023 spicyjpeg - MPL licensed */ /** @@ -17,7 +17,9 @@ * FMV playback is not part of this library per se, but can implemented using * the APIs defined here alongside some code to stream data from the CD drive. * - * Currently only version 1 and 2 .BS files are supported. + * Currently bitstream versions 1, 2 and 3 are supported. Version 0 and .IKI + * bitstreams are not supported, but no encoder is publicly available for those + * anyway. */ #ifndef __PSXPRESS_H @@ -34,28 +36,26 @@ typedef struct _DECDCTENV { int16_t dct[64]; // Inverse DCT matrix (2.14 fixed-point) } DECDCTENV; -// This is the "small" lookup table used by DecDCTvlc(). It can be copied to -// the scratchpad. 
+typedef struct _VLC_TableV2 { + uint16_t ac0[2]; + uint32_t ac2[8], ac3[64]; + uint16_t ac4[8], ac5[8], ac7[16], ac8[32]; + uint16_t ac9[32], ac10[32], ac11[32], ac12[32]; +} VLC_TableV2; + +typedef struct _VLC_TableV3 { + uint16_t ac0[2]; + uint32_t ac2[8], ac3[64]; + uint16_t ac4[8], ac5[8], ac7[16], ac8[32]; + uint16_t ac9[32], ac10[32], ac11[32], ac12[32]; + uint8_t dc[128], dc_len[9]; + uint8_t _reserved[3]; +} VLC_TableV3; + typedef struct _DECDCTTAB { - uint16_t lut0[2]; - uint32_t lut2[8]; - uint32_t lut3[64]; - uint16_t lut4[8]; - uint16_t lut5[8]; - uint16_t lut7[16]; - uint16_t lut8[32]; - uint16_t lut9[32]; - uint16_t lut10[32]; - uint16_t lut11[32]; - uint16_t lut12[32]; + uint32_t ac[8192], ac00[512]; } DECDCTTAB; -// This is the "large" table used by DecDCTvlc2(). -typedef struct _DECDCTTAB2 { - uint32_t lut[8192]; - uint32_t lut00[512]; -} DECDCTTAB2; - typedef enum _DECDCTMODE { DECDCT_MODE_24BPP = 1, DECDCT_MODE_16BPP = 0, @@ -66,8 +66,9 @@ typedef enum _DECDCTMODE { typedef struct _VLC_Context { const uint32_t *input; uint32_t window, next_window, remaining; - uint16_t quant_scale; int8_t is_v3, bit_offset, block_index, coeff_index; + uint16_t quant_scale; + int16_t last_y, last_cr, last_cb; } VLC_Context; // Despite what some docs claim, the "number of 32-byte blocks" and "always @@ -233,8 +234,9 @@ int DecDCToutSync(int mode); * frame) into a buffer that can be passed to DecDCTin(). This function uses a * small (<1 KB) lookup table combined with the GTE to accelerate the process; * performance is roughly on par with DecDCTvlcStart2() if the lookup table - * is copied to the scratchpad beforehand by calling DecDCTvlcCopyTable(). The - * contents of the GTE's LZCR register, if any, will be destroyed. + * is copied to the scratchpad beforehand by calling DecDCTvlcCopyTableV2() or + * DecDCTvlcCopyTableV3(). The contents of the GTE's LZCS and LZCR registers, + * if any, will be destroyed. 
* * A VLC_Context object must be created and passed to this function, which will * then proceed to initialize its fields. The max_size argument sets the @@ -256,7 +258,7 @@ int DecDCToutSync(int mode); * @param bs * @return 0, 1 if more data needs to be output or -1 in case of failure * - * @see DecDCTvlcContinue(), DecDCTvlcCopyTable() + * @see DecDCTvlcContinue(), DecDCTvlcCopyTableV2(), DecDCTvlcCopyTableV3() */ int DecDCTvlcStart(VLC_Context *ctx, uint32_t *buf, size_t max_size, const uint32_t *bs); @@ -275,7 +277,8 @@ int DecDCTvlcStart(VLC_Context *ctx, uint32_t *buf, size_t max_size, const uint3 * context returned 0; in that case the context shall be discarded or reused to * decode another bitstream. * - * The contents of the GTE's LZCR register, if any, will be destroyed. + * The contents of the GTE's LZCS and LZCR registers, if any, will be + * destroyed. * * See DecDCTvlcStart() for more details. * @@ -309,7 +312,7 @@ int DecDCTvlcContinue(VLC_Context *ctx, uint32_t *buf, size_t max_size); * @param buf * @return 0, 1 if more data needs to be output or -1 in case of failure * - * @see DecDCTvlcSize(), DecDCTvlcCopyTable() + * @see DecDCTvlcSize(), DecDCTvlcCopyTableV2(), DecDCTvlcCopyTableV3() */ int DecDCTvlc(const uint32_t *bs, uint32_t *buf); @@ -332,23 +335,60 @@ int DecDCTvlc(const uint32_t *bs, uint32_t *buf); size_t DecDCTvlcSize(size_t size); /** - * @brief Moves the lookup table used by the .BS decompressor to the scratchpad - * region. + * @brief Copies the lookup tables used by the .BS decompressor (v1/v2) to the + * scratchpad region. + * + * @details Copies the lookup table used by DecDCTvlcContinue(), + * DecDCTvlcStart() and DecDCTvlc() to the specified address. A copy of this + * table is always present in main RAM, however this function can be used to + * copy it to the scratchpad region to boost decompression performance. 
+ * + * This function copies a 676-byte table (VLC_TableV2 structure) containing + * only the data necessary for decoding version 1 and 2 bitstreams, to help + * save scratchpad space. If support for version 3 is required, + * DecDCTvlcCopyTableV3() can be used instead to copy the full 816-byte table. + * + * The address passed to this function is saved. Calls to DecDCTvlcStart(), + * DecDCTvlcContinue() and DecDCTvlc() will automatically use the last table + * copied. Call DecDCTvlcCopyTableV2(0) or DecDCTvlcCopyTableV3(0) to revert to + * using the library's internal table in main RAM. + * + * WARNING: attempting to decode a version 3 .BS file or .STR frame after + * calling this function will result in undefined behavior and potentially a + * crash. To re-enable version 3 decoding, use DecDCTvlcCopyTableV3() to copy + * the full table to the scratchpad or revert to using the built-in table in + * main RAM. + * + * @param addr Pointer to free 676-byte area in scratchpad region or 0 to reset + * + * @see DecDCTvlcCopyTableV3() + */ +void DecDCTvlcCopyTableV2(VLC_TableV2 *addr); + +/** + * @brief Copies the lookup tables used by the .BS decompressor (v1/v2/v3) to + * the scratchpad region. * - * @details Copies the small (<1 KB) lookup table used by DecDCTvlcContinue(), - * DecDCTvlcStart() and DecDCTvlc() (a DECDCTTAB structure) to the specified - * address. A copy of this table is always present in main RAM, however this - * function can be used to copy it to the scratchpad region to boost - * decompression performance. + * @details Copies the lookup table used by DecDCTvlcContinue(), + * DecDCTvlcStart() and DecDCTvlc() to the specified address. A copy of this + * table is always present in main RAM, however this function can be used to + * copy it to the scratchpad region to boost decompression performance. + * + * This function copies the full 816-byte table (VLC_TableV3 structure), + * including the data used to decode version 3 bitstreams. 
If support for + * version 3 is not required, DecDCTvlcCopyTableV2() can be used instead to + * save scratchpad space by only copying the first 676 bytes of the table. * * The address passed to this function is saved. Calls to DecDCTvlcStart(), * DecDCTvlcContinue() and DecDCTvlc() will automatically use the last table - * copied. Call DecDCTvlcCopyTable(0) to revert to using the library's internal - * table in main RAM. + * copied. Call DecDCTvlcCopyTableV2(0) or DecDCTvlcCopyTableV3(0) to revert to + * using the library's internal table in main RAM. + * + * @param addr Pointer to free 816-byte area in scratchpad region or 0 to reset * - * @param addr Pointer to free area in scratchpad region or 0 to reset + * @see DecDCTvlcCopyTableV2() */ -void DecDCTvlcCopyTable(DECDCTTAB *addr); +void DecDCTvlcCopyTableV3(VLC_TableV3 *addr); /** * @brief Decompresses or begins decompressing a .BS file into MDEC codes @@ -360,8 +400,8 @@ void DecDCTvlcCopyTable(DECDCTTAB *addr); * calling DecDCTvlcBuild(), but does not use the GTE nor the scratchpad. * Depending on the specific bitstream being decoded DecDCTvlcStart2() might be * slightly faster or slower than DecDCTvlcStart() with its lookup table copied - * to the scratchpad (see DecDCTvlcCopyTable()). DecDCTvlcStart() with the - * table in main RAM tends to be much slower. + * to the scratchpad (see DecDCTvlcCopyTableV2() and DecDCTvlcCopyTableV3()). + * DecDCTvlcStart() with the table in main RAM tends to be much slower. * * A VLC_Context object must be created and passed to this function, which will * then proceed to initialize its fields. 
The max_size argument sets the @@ -432,7 +472,7 @@ int DecDCTvlcContinue2(VLC_Context *ctx, uint32_t *buf, size_t max_size); * * @see DecDCTvlcSize2(), DecDCTvlcBuild() */ -int DecDCTvlc2(const uint32_t *bs, uint32_t *buf, DECDCTTAB2 *table); +int DecDCTvlc2(const uint32_t *bs, uint32_t *buf, DECDCTTAB *table); /** * @brief Sets the maximum amount of data to be decompressed (alternate @@ -458,7 +498,7 @@ size_t DecDCTvlcSize2(size_t size); * the .BS decompressor. * * @details Generates the lookup table required by DecDCTvlcStart2(), - * DecDCTvlcContinue2() and DecDCTvlc2() (a DECDCTTAB2 structure) into the + * DecDCTvlcContinue2() and DecDCTvlc2() (a DECDCTTAB structure) into the * specified buffer. Since the table is relatively large (34 KB), it is * recommended to only generate it in a dynamically-allocated buffer when * needed and deallocate the buffer afterwards. @@ -468,7 +508,7 @@ size_t DecDCTvlcSize2(size_t size); * * @param table */ -void DecDCTvlcBuild(DECDCTTAB2 *table); +void DecDCTvlcBuild(DECDCTTAB *table); #ifdef __cplusplus } diff --git a/libpsn00b/psxpress/README.md b/libpsn00b/psxpress/README.md index a894874..df18ec5 100644 --- a/libpsn00b/psxpress/README.md +++ b/libpsn00b/psxpress/README.md @@ -1,14 +1,19 @@ # PSn00bSDK MDEC library -This is a fully open source reimplementation of the official SDK's "data +This is a fully original reimplementation of the official SDK's "data compression" library. This library is made up of two parts, the MDEC API and functions to decompress Huffman-encoded bitstreams (.BS files, or frames in -.STR files) into data to be fed to the MDEC. FMV playback is not part of this -library (nor the official one) per se, but can implemented by using these APIs -alongside some code to stream data from the CD drive. +.STR files) into data to be fed to the MDEC. Two different implementations of +the latter are provided, one using the GTE and scratchpad region and an older +one using a large lookup table in main RAM. 
-**Currently only version 1 and 2 bitstreams are supported**. +FMV playback is not part of this library per se, but can implemented using the +APIs defined here alongside some code to stream data from the CD drive. + +Currently bitstream versions 1, 2 and 3 are supported. Version 0 and .IKI +bitstreams are not supported, but no encoder is publicly available for those +anyway. ## MDEC API @@ -26,14 +31,16 @@ The following functions are currently provided: - `DecDCTvlcStart()`, `DecDCTvlcContinue()`: a decompressor implementation that uses a small (<1 KB) lookup table and leverages the GTE, written in assembly. - `DecDCTvlcCopyTable()` can optionally be called to temporarily move the table - to the scratchpad region to improve decompression speed. -- `DecDCTvlcStart2()`, `DecDCTvlcContinue2()`: a different implementation using + `DecDCTvlcCopyTableV2()` or `DecDCTvlcCopyTableV3()` may optionally be called + to temporarily move the table to the scratchpad region in order to boost + decompression speed. +- `DecDCTvlcStart2()`, `DecDCTvlcContinue2()`: an older implementation using a large (34 KB) lookup table in main RAM, written in C. The table must be - decompressed ahead of time using `DecDCTvlcBuild()`, but can be deallocated - when no longer needed. + decompressed ahead of time manually using `DecDCTvlcBuild()`, but can be + deallocated when no longer needed. **This implementation does not support** + **version 3 bitstreams**. - `DecDCTvlc()`, `DecDCTvlc2()`: wrappers around the functions listed above, - for compatibility with the Sony SDK. Using them is not recommended. + for compatibility with the Sony SDK. 
## SPU ADPCM encoding API diff --git a/libpsn00b/psxpress/mdec.c b/libpsn00b/psxpress/mdec.c index 3596188..394a0ce 100644 --- a/libpsn00b/psxpress/mdec.c +++ b/libpsn00b/psxpress/mdec.c @@ -1,12 +1,11 @@ /* * PSn00bSDK MDEC library (low-level MDEC/DMA API) - * (C) 2022 spicyjpeg - MPL licensed + * (C) 2022-2023 spicyjpeg - MPL licensed */ #include #include #include -#include #include #include @@ -15,14 +14,14 @@ /* Default IDCT matrix and quantization tables */ -#define S0 0x5a82 // 0x4000 * cos(0/16 * pi) * sqrt(2) -#define S1 0x7d8a // 0x4000 * cos(1/16 * pi) * 2 -#define S2 0x7641 // 0x4000 * cos(2/16 * pi) * 2 -#define S3 0x6a6d // 0x4000 * cos(3/16 * pi) * 2 -#define S4 0x5a82 // 0x4000 * cos(4/16 * pi) * 2 -#define S5 0x471c // 0x4000 * cos(5/16 * pi) * 2 -#define S6 0x30fb // 0x4000 * cos(6/16 * pi) * 2 -#define S7 0x18f8 // 0x4000 * cos(7/16 * pi) * 2 +#define S0 0x5a82 // (1 << 14) * cos(0/16 * pi) * sqrt(2) +#define S1 0x7d8a // (1 << 14) * cos(1/16 * pi) * 2 +#define S2 0x7641 // (1 << 14) * cos(2/16 * pi) * 2 +#define S3 0x6a6d // (1 << 14) * cos(3/16 * pi) * 2 +#define S4 0x5a82 // (1 << 14) * cos(4/16 * pi) * 2 +#define S5 0x471c // (1 << 14) * cos(5/16 * pi) * 2 +#define S6 0x30fb // (1 << 14) * cos(6/16 * pi) * 2 +#define S7 0x18f8 // (1 << 14) * cos(7/16 * pi) * 2 static const DECDCTENV _default_mdec_env = { // The default luma and chroma quantization table is based on the MPEG-1 @@ -85,8 +84,6 @@ static const DECDCTENV _default_mdec_env = { /* Public API */ void DecDCTReset(int mode) { - FastEnterCriticalSection(); - SetDMAPriority(DMA_MDEC_IN, 3); SetDMAPriority(DMA_MDEC_OUT, 3); DMA_CHCR(DMA_MDEC_IN) = 0x00000201; // Stop DMA @@ -95,26 +92,28 @@ void DecDCTReset(int mode) { MDEC1 = 0x80000000; // Reset MDEC MDEC1 = 0x60000000; // Enable DMA in/out requests - FastExitCriticalSection(); if (!mode) DecDCTPutEnv(0, 0); } void DecDCTPutEnv(const DECDCTENV *env, int mono) { - const DECDCTENV *_env = env ? 
env : &_default_mdec_env; DecDCTinSync(0); + if (!env) + env = &_default_mdec_env; MDEC0 = 0x60000000; // Set IDCT matrix - DecDCTinRaw((const uint32_t *) _env->dct, 32); + DecDCTinRaw((const uint32_t *) env->dct, 32); DecDCTinSync(0); - MDEC0 = 0x40000000 | (mono ? 0 : 1); // Set table(s) - DecDCTinRaw((const uint32_t *) _env->iq_y, mono ? 16 : 32); + MDEC0 = 0x40000000 | (mono ? 0 : 1); // Set quantization table(s) + DecDCTinRaw((const uint32_t *) env->iq_y, mono ? 16 : 32); DecDCTinSync(0); } void DecDCTin(const uint32_t *data, int mode) { uint32_t header = *data; + DecDCTinSync(0); + if (mode == DECDCT_MODE_RAW) MDEC0 = header; else if (mode & DECDCT_MODE_24BPP) @@ -153,7 +152,7 @@ int DecDCTinSync(int mode) { return 0; } - _sdk_log("DecDCTinSync() timeout\n"); + _sdk_log("DecDCTinSync() timeout, MDEC1=0x%08x\n", MDEC1); return -1; } @@ -184,6 +183,6 @@ int DecDCToutSync(int mode) { return 0; } - _sdk_log("DecDCToutSync() timeout\n"); + _sdk_log("DecDCToutSync() timeout, CHCR=0x%08x\n", DMA_CHCR(DMA_MDEC_OUT)); return -1; } diff --git a/libpsn00b/psxpress/vlc.c b/libpsn00b/psxpress/vlc.c index 4e3e283..36cfbe2 100644 --- a/libpsn00b/psxpress/vlc.c +++ b/libpsn00b/psxpress/vlc.c @@ -1,6 +1,6 @@ /* * PSn00bSDK MDEC library (support code for the main VLC decompressor) - * (C) 2022 spicyjpeg - MPL licensed + * (C) 2022-2023 spicyjpeg - MPL licensed */ #include @@ -10,87 +10,120 @@ /* Huffman code lookup table */ -#define _val1(rl, dc) (((rl) << 10) | ((uint16_t) (dc) & 0x3ff)) -#define _val2(rl, dc, len) (_val1(rl, dc) | (len << 16)) +#define _DC(y, c) (((y) << 4) | (c)) +#define _AC(rl, dc) (((rl) << 10) | ((uint16_t) (dc) & 0x3ff)) +#define _ACL(rl, dc, len) (_AC(rl, dc) | ((len) << 16)) -#define _pair(rl, dc) _val1(rl, dc), _val1(rl, -(dc)) -#define _pair2(rl, dc, len) _val2(rl, dc, len), _val2(rl, -(dc), len) -#define _pair3(rl, dc, len) \ - _val2(rl, dc, len), _val2(rl, dc, len), \ - _val2(rl, -(dc), len), _val2(rl, -(dc), len) -#define _pair4(rl, dc, len) \ - 
_val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), \ - _val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), \ - _val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len), \ - _val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len) +#define _DC2(y, c) _DC(y, c), _DC(y, c) +#define _DC3(y, c) _DC(y, c), _DC(y, c), _DC(y, c), _DC(y, c) +#define _DC4(y, c) \ + _DC(y, c), _DC(y, c), _DC(y, c), _DC(y, c), \ + _DC(y, c), _DC(y, c), _DC(y, c), _DC(y, c) +#define _AC2(rl, dc) _AC(rl, dc), _AC(rl, -(dc)) +#define _ACL2(rl, dc, len) _ACL(rl, dc, len), _ACL(rl, -(dc), len) +#define _ACL3(rl, dc, len) \ + _ACL(rl, dc, len), _ACL(rl, dc, len), \ + _ACL(rl, -(dc), len), _ACL(rl, -(dc), len) +#define _ACL4(rl, dc, len) \ + _ACL(rl, dc, len), _ACL(rl, dc, len), _ACL(rl, dc, len), _ACL(rl, dc, len), \ + _ACL(rl, dc, len), _ACL(rl, dc, len), _ACL(rl, dc, len), _ACL(rl, dc, len), \ + _ACL(rl, -(dc), len), _ACL(rl, -(dc), len), _ACL(rl, -(dc), len), _ACL(rl, -(dc), len), \ + _ACL(rl, -(dc), len), _ACL(rl, -(dc), len), _ACL(rl, -(dc), len), _ACL(rl, -(dc), len) // This table isn't compressed since it makes no sense to compress less than a // kilobyte's worth of data. 
-static const DECDCTTAB _default_huffman_table = { - .lut0 = { +static const VLC_TableV3 _default_huffman_table = { + .ac0 = { // 11 x - _pair( 0, 1) + _AC2( 0, 1) }, - .lut2 = { + .ac2 = { // 01 0xx - _pair2( 0, 2, 5), _pair2( 2, 1, 5), + _ACL2( 0, 2, 5), _ACL2( 2, 1, 5), // 01 1x- - _pair3( 1, 1, 4) + _ACL3( 1, 1, 4) }, - .lut3 = { + .ac3 = { // 001 00xxxx - _pair2(13, 1, 9), _pair2( 0, 6, 9), _pair2(12, 1, 9), _pair2(11, 1, 9), - _pair2( 3, 2, 9), _pair2( 1, 3, 9), _pair2( 0, 5, 9), _pair2(10, 1, 9), + _ACL2(13, 1, 9), _ACL2( 0, 6, 9), _ACL2(12, 1, 9), _ACL2(11, 1, 9), + _ACL2( 3, 2, 9), _ACL2( 1, 3, 9), _ACL2( 0, 5, 9), _ACL2(10, 1, 9), // 001 xxx--- - _pair4( 0, 3, 6), _pair4( 4, 1, 6), _pair4( 3, 1, 6) + _ACL4( 0, 3, 6), _ACL4( 4, 1, 6), _ACL4( 3, 1, 6) }, - .lut4 = { + .ac4 = { // 0001 xxx - _pair( 7, 1), _pair( 6, 1), _pair( 1, 2), _pair( 5, 1) + _AC2( 7, 1), _AC2( 6, 1), _AC2( 1, 2), _AC2( 5, 1) }, - .lut5 = { + .ac5 = { // 00001 xxx - _pair( 2, 2), _pair( 9, 1), _pair( 0, 4), _pair( 8, 1) + _AC2( 2, 2), _AC2( 9, 1), _AC2( 0, 4), _AC2( 8, 1) }, - .lut7 = { + .ac7 = { // 0000001 xxxx - _pair(16, 1), _pair( 5, 2), _pair( 0, 7), _pair( 2, 3), - _pair( 1, 4), _pair(15, 1), _pair(14, 1), _pair( 4, 2) + _AC2(16, 1), _AC2( 5, 2), _AC2( 0, 7), _AC2( 2, 3), + _AC2( 1, 4), _AC2(15, 1), _AC2(14, 1), _AC2( 4, 2) }, - .lut8 = { + .ac8 = { // 00000001 xxxxx - _pair( 0, 11), _pair( 8, 2), _pair( 4, 3), _pair( 0, 10), - _pair( 2, 4), _pair( 7, 2), _pair(21, 1), _pair(20, 1), - _pair( 0, 9), _pair(19, 1), _pair(18, 1), _pair( 1, 5), - _pair( 3, 3), _pair( 0, 8), _pair( 6, 2), _pair(17, 1) + _AC2( 0, 11), _AC2( 8, 2), _AC2( 4, 3), _AC2( 0, 10), + _AC2( 2, 4), _AC2( 7, 2), _AC2(21, 1), _AC2(20, 1), + _AC2( 0, 9), _AC2(19, 1), _AC2(18, 1), _AC2( 1, 5), + _AC2( 3, 3), _AC2( 0, 8), _AC2( 6, 2), _AC2(17, 1) }, - .lut9 = { + .ac9 = { // 000000001 xxxxx - _pair(10, 2), _pair( 9, 2), _pair( 5, 3), _pair( 3, 4), - _pair( 2, 5), _pair( 1, 7), _pair( 1, 6), _pair( 0, 15), - _pair( 0, 
14), _pair( 0, 13), _pair( 0, 12), _pair(26, 1), - _pair(25, 1), _pair(24, 1), _pair(23, 1), _pair(22, 1) + _AC2(10, 2), _AC2( 9, 2), _AC2( 5, 3), _AC2( 3, 4), + _AC2( 2, 5), _AC2( 1, 7), _AC2( 1, 6), _AC2( 0, 15), + _AC2( 0, 14), _AC2( 0, 13), _AC2( 0, 12), _AC2(26, 1), + _AC2(25, 1), _AC2(24, 1), _AC2(23, 1), _AC2(22, 1) }, - .lut10 = { + .ac10 = { // 0000000001 xxxxx - _pair( 0, 31), _pair( 0, 30), _pair( 0, 29), _pair( 0, 28), - _pair( 0, 27), _pair( 0, 26), _pair( 0, 25), _pair( 0, 24), - _pair( 0, 23), _pair( 0, 22), _pair( 0, 21), _pair( 0, 20), - _pair( 0, 19), _pair( 0, 18), _pair( 0, 17), _pair( 0, 16) + _AC2( 0, 31), _AC2( 0, 30), _AC2( 0, 29), _AC2( 0, 28), + _AC2( 0, 27), _AC2( 0, 26), _AC2( 0, 25), _AC2( 0, 24), + _AC2( 0, 23), _AC2( 0, 22), _AC2( 0, 21), _AC2( 0, 20), + _AC2( 0, 19), _AC2( 0, 18), _AC2( 0, 17), _AC2( 0, 16) }, - .lut11 = { + .ac11 = { // 00000000001 xxxxx - _pair( 0, 40), _pair( 0, 39), _pair( 0, 38), _pair( 0, 37), - _pair( 0, 36), _pair( 0, 35), _pair( 0, 34), _pair( 0, 33), - _pair( 0, 32), _pair( 1, 14), _pair( 1, 13), _pair( 1, 12), - _pair( 1, 11), _pair( 1, 10), _pair( 1, 9), _pair( 1, 8) + _AC2( 0, 40), _AC2( 0, 39), _AC2( 0, 38), _AC2( 0, 37), + _AC2( 0, 36), _AC2( 0, 35), _AC2( 0, 34), _AC2( 0, 33), + _AC2( 0, 32), _AC2( 1, 14), _AC2( 1, 13), _AC2( 1, 12), + _AC2( 1, 11), _AC2( 1, 10), _AC2( 1, 9), _AC2( 1, 8) }, - .lut12 = { + .ac12 = { // 000000000001 xxxxx - _pair( 1, 18), _pair( 1, 17), _pair( 1, 16), _pair( 1, 15), - _pair( 6, 3), _pair(16, 2), _pair(15, 2), _pair(14, 2), - _pair(13, 2), _pair(12, 2), _pair(11, 2), _pair(31, 1), - _pair(30, 1), _pair(29, 1), _pair(28, 1), _pair(27, 1) + _AC2( 1, 18), _AC2( 1, 17), _AC2( 1, 16), _AC2( 1, 15), + _AC2( 6, 3), _AC2(16, 2), _AC2(15, 2), _AC2(14, 2), + _AC2(13, 2), _AC2(12, 2), _AC2(11, 2), _AC2(31, 1), + _AC2(30, 1), _AC2(29, 1), _AC2(28, 1), _AC2(27, 1) + }, + .dc = { + // 00----- + _DC4(1, 0), _DC4(1, 0), _DC4(1, 0), _DC4(1, 0), + // 01----- + _DC4(2, 1), _DC4(2, 1), 
_DC4(2, 1), _DC4(2, 1), + // 100---- + _DC4(0, 2), _DC4(0, 2), + // 101---- + _DC4(3, 2), _DC4(3, 2), + // 110---- + _DC4(4, 3), _DC4(4, 3), + // 1110--- + _DC4(5, 4), + // 11110-- + _DC3(6, 5), + // 111110- + _DC2(7, 6), + // 1111110 + _DC(8, 7), + // 1111111(0) + _DC(0, 8) + }, + .dc_len = { + _DC(3, 2), _DC(2, 2), _DC(2, 2), _DC(3, 3), + _DC(3, 4), _DC(4, 5), _DC(5, 6), _DC(6, 7), + _DC(7, 8) } }; @@ -100,7 +133,7 @@ static const DECDCTTAB _default_huffman_table = { static VLC_Context _default_context; static size_t _max_buffer_size = 0; -const DECDCTTAB *_vlc_huffman_table = &_default_huffman_table; +const VLC_TableV3 *_vlc_huffman_table = &_default_huffman_table; /* Stateful VLC decoder API (for Sony SDK compatibility) */ @@ -120,10 +153,19 @@ size_t DecDCTvlcSize(size_t size) { /* Lookup table relocation API */ -void DecDCTvlcCopyTable(DECDCTTAB *addr) { +void DecDCTvlcCopyTableV2(VLC_TableV2 *addr) { + if (addr) { + _vlc_huffman_table = (const VLC_TableV3 *) addr; + memcpy(addr, &_default_huffman_table, sizeof(VLC_TableV2)); + } else { + _vlc_huffman_table = &_default_huffman_table; + } +} + +void DecDCTvlcCopyTableV3(VLC_TableV3 *addr) { if (addr) { - _vlc_huffman_table = addr; - memcpy(addr, &_default_huffman_table, sizeof(DECDCTTAB)); + _vlc_huffman_table = (const VLC_TableV3 *) addr; + memcpy(addr, &_default_huffman_table, sizeof(VLC_TableV3)); } else { _vlc_huffman_table = &_default_huffman_table; } diff --git a/libpsn00b/psxpress/vlc.s b/libpsn00b/psxpress/vlc.s index f3a1c67..2de22f7 100644 --- a/libpsn00b/psxpress/vlc.s +++ b/libpsn00b/psxpress/vlc.s @@ -1,375 +1,576 @@ # PSn00bSDK MDEC library (GTE-accelerated VLC decompressor) -# (C) 2022 spicyjpeg - MPL licensed +# (C) 2022-2023 spicyjpeg - MPL licensed # -# Register map: -# - $a0 = ctx -# - $a1 = output -# - $a2 = max_size -# - $a3 = input -# - $t0 = window -# - $t1 = next_window -# - $t2 = remaining -# - $t3 = quant_scale -# - $t4 = is_v3 -# - $t5 = bit_offset -# - $t6 = block_index -# - $t7 = 
coeff_index -# - $t8 = _vlc_huffman_table -# - $t9 = &ac_jump_area +# TODO: reduce the size of the v3 DC coefficient decoder; currently the code is +# duplicated for each block type, but it can probably be shortened with no +# performance impact... -.set noreorder +.include "gtereg.inc" -.set VLC_Context_input, 0 -.set VLC_Context_window, 4 -.set VLC_Context_next_window, 8 -.set VLC_Context_remaining, 12 -.set VLC_Context_quant_scale, 16 -.set VLC_Context_is_v3, 18 -.set VLC_Context_bit_offset, 19 -.set VLC_Context_block_index, 20 -.set VLC_Context_coeff_index, 21 - -.set DECDCTTAB_lut0, 0 -.set DECDCTTAB_lut2, 4 -.set DECDCTTAB_lut3, 36 -.set DECDCTTAB_lut4, 292 -.set DECDCTTAB_lut5, 308 -.set DECDCTTAB_lut7, 324 -.set DECDCTTAB_lut8, 356 -.set DECDCTTAB_lut9, 420 -.set DECDCTTAB_lut10, 484 -.set DECDCTTAB_lut11, 548 -.set DECDCTTAB_lut12, 612 +.set noreorder +.set noat + +.set value, $v0 +.set length, $v1 +.set ctx, $a0 +.set output, $a1 +.set max_size, $a2 +.set input, $a3 +.set temp, $t0 +.set window, $t1 +.set next_window, $t2 +.set remaining, $t3 +.set is_v3, $t4 +.set bit_offset, $t5 +.set block_index, $t6 +.set coeff_index, $t7 +.set quant_scale, $s0 +.set last_y, $s1 +.set last_cr, $s2 +.set last_cb, $s3 +.set huffman_table, $t8 +.set ac_jump_area, $t9 + +.set VLC_Context_input, 0x0 +.set VLC_Context_window, 0x4 +.set VLC_Context_next_window, 0x8 +.set VLC_Context_remaining, 0xc +.set VLC_Context_is_v3, 0x10 +.set VLC_Context_bit_offset, 0x11 +.set VLC_Context_block_index, 0x12 +.set VLC_Context_coeff_index, 0x13 +.set VLC_Context_quant_scale, 0x14 +.set VLC_Context_last_y, 0x16 +.set VLC_Context_last_cr, 0x18 +.set VLC_Context_last_cb, 0x1a + +.set VLC_Table_ac0, 0x0 +.set VLC_Table_ac2, 0x4 +.set VLC_Table_ac3, 0x24 +.set VLC_Table_ac4, 0x124 +.set VLC_Table_ac5, 0x134 +.set VLC_Table_ac7, 0x144 +.set VLC_Table_ac8, 0x164 +.set VLC_Table_ac9, 0x1a4 +.set VLC_Table_ac10, 0x1e4 +.set VLC_Table_ac11, 0x224 +.set VLC_Table_ac12, 0x264 +.set VLC_Table_dc, 
0x2a4 +.set VLC_Table_dc_len, 0x324 .section .text.DecDCTvlcStart .global DecDCTvlcStart .type DecDCTvlcStart, @function DecDCTvlcStart: + addiu $sp, -16 + sw $s0, 0($sp) + sw $s1, 4($sp) + sw $s2, 8($sp) + sw $s3, 12($sp) + # Create a new context on-the-fly without writing it to memory then jump # into DecDCTvlcContinue(), skipping context loading. - lw $t0, 8($a3) # window = (bs->data[0] << 16) | (bs->data[0] >> 16) - nop - srl $v0, $t0, 16 - sll $t0, 16 - - lw $t1, 12($a3) # next_window = (bs->data[1] << 16) | (bs->data[1] >> 16) - or $t0, $v0 - srl $v0, $t1, 16 - sll $t1, 16 - - lhu $t2, 0($a3) # remaining = bs->uncomp_length * 2 - or $t1, $v0 - - lhu $t3, 4($a3) # quant_scale = (bs->quant_scale & 63) << 10 - sll $t2, 1 - andi $t3, 63 - - lhu $t4, 6($a3) # is_v3 = !(bs->version < 3) - sll $t3, 10 - sltiu $t4, $t4, 3 - xori $t4, 1 - - li $t5, 32 # bit_offset = 32 - li $t6, 5 # block_index = 5 - li $t7, 0 # coeff_index = 0 + lw window, 8(input) # window = (bs->data[0] << 16) | (bs->data[0] >> 16) + li last_y, 0 + srl temp, window, 16 + sll window, 16 + or window, temp + + # next_window = (bs->data[1] << 16) | (bs->data[1] >> 16) + lw next_window, 12(input) + li last_cr, 0 + srl temp, next_window, 16 + sll next_window, 16 + or next_window, temp + + lhu remaining, 0(input) # remaining = bs->uncomp_length * 2 + li last_cb, 0 + sll remaining, 1 + + lw temp, 4(input) # quant_scale = (bs->quant_scale & 63) << 10 + li bit_offset, 32 + andi quant_scale, temp, 63 + sll quant_scale, 10 + + srl temp, 16 # is_v3 = !(bs->version < 3) + sltiu is_v3, temp, 3 + xori is_v3, 1 + + li block_index, 5 + li coeff_index, 0 j _vlc_skip_context_load - addiu $a3, 16 # input = &(bs->data[2]) + addiu input, 16 # input = &(bs->data[2]) .section .text.DecDCTvlcContinue .global DecDCTvlcContinue .type DecDCTvlcContinue, @function DecDCTvlcContinue: - lw $a3, VLC_Context_input($a0) - lw $t0, VLC_Context_window($a0) - lw $t1, VLC_Context_next_window($a0) - lw $t2, VLC_Context_remaining($a0) - 
lhu $t3, VLC_Context_quant_scale($a0) - lb $t4, VLC_Context_is_v3($a0) - lb $t5, VLC_Context_bit_offset($a0) - lb $t6, VLC_Context_block_index($a0) - lb $t7, VLC_Context_coeff_index($a0) + addiu $sp, -16 + sw $s0, 0($sp) + sw $s1, 4($sp) + sw $s2, 8($sp) + sw $s3, 12($sp) + + lw input, VLC_Context_input(ctx) + lw window, VLC_Context_window(ctx) + lw next_window, VLC_Context_next_window(ctx) + lw remaining, VLC_Context_remaining(ctx) + lb is_v3, VLC_Context_is_v3(ctx) + lb bit_offset, VLC_Context_bit_offset(ctx) + lb block_index, VLC_Context_block_index(ctx) + lb coeff_index, VLC_Context_coeff_index(ctx) + lhu quant_scale, VLC_Context_quant_scale(ctx) + lh last_y, VLC_Context_last_y(ctx) + lh last_cr, VLC_Context_last_cr(ctx) + lh last_cb, VLC_Context_last_cb(ctx) _vlc_skip_context_load: - # Determine how many bytes to output. This whole block of code basically - # does this: + # Determine how many bytes to output. + # if (max_size <= 0) max_size = 0x3fff0000 # max_size = min((max_size - 1) * 2, remaining) # remaining -= max_size - bgtz $a2, .Lmax_size_valid # if (max_size <= 0) max_size = 0x7ffe0000 - addiu $a2, -1 # else max_size = (max_size - 1) * 2 - lui $a2, 0x3fff + bgtz max_size, .Lmax_size_valid + addiu max_size, -1 + lui max_size, 0x3fff .Lmax_size_valid: - sll $a2, 1 + sll max_size, 1 - blt $a2, $t2, .Lmax_size_ok # if (max_size > remaining) max_size = remaining - lui $v1, 0x3800 - move $a2, $t2 -.Lmax_size_ok: - subu $t2, $a2 # remaining -= max_size + subu remaining, max_size + bgez remaining, .Lmax_size_ok + lui temp, 0x3800 + addu max_size, remaining + li remaining, 0 + +.Lmax_size_ok: # Write the length of the data that will be decoded to the first 4 bytes of the # output buffer, which will then be parsed by DecDCTin(). 
- srl $v0, $a2, 1 # output[0] = 0x38000000 | (max_size / 2) - or $v0, $v1 - sw $v0, 0($a1) + srl value, max_size, 1 # output[0] = 0x38000000 | (max_size / 2) + or value, temp + sw value, 0(output) # Obtain the addresses of the lookup table and jump area in advance so that # they don't have to be retrieved for each coefficient decoded. - lw $t8, _vlc_huffman_table - la $t9, .Lac_prefix_10 + lw huffman_table, _vlc_huffman_table + la ac_jump_area, .Lac_prefix_01 - 32 - beqz $a2, .Lstop_processing - addiu $a1, 4 # output = (uint16_t *) &output[1] + beqz max_size, .Lstop_processing + addiu output, 4 .Lprocess_next_code_loop: # while (max_size) # This is the "hot" part of the decoder, executed for each code in the # bitstream. The first step is to determine if the next code is a DC or AC - # coefficient. - bnez $t7, .Lprocess_ac_coefficient - addiu $t7, 1 # coeff_index++ - bnez $t4, .Lprocess_dc_v3_coefficient - li $v1, 0x01ff + # coefficient; at the same time the GTE is given the task of counting the + # number of leading zeroes/ones in the code (which takes 2 more cycles). + mtc2 window, C2_LZCS + + bnez coeff_index, .Lprocess_ac_coefficient + addiu coeff_index, 1 + bnez is_v3, .Lprocess_dc_v3_coefficient + li temp, 0x1ff .Lprocess_dc_v2_coefficient: # if (!coeff_index && !is_v3) # The DC coefficient in version 2 frames is not compressed. Value 0x1ff is # used to signal the end of the bitstream. 
- srl $v0, $t0, 22 # prefix = (window >> (32 - 10)) - beq $v0, $v1, .Lstop_processing # if (prefix == 0x1ff) break - or $v0, $t3 # *output = prefix | quant_scale - sll $t0, 10 # window <<= 10 - b .Lwrite_value - addiu $t5, -10 # bit_offset -= 10 + # prefix = window >> (32 - 10) + # if (prefix == 0x1ff) break + # *output = prefix | quant_scale + srl value, window, 22 + beq value, temp, .Lstop_processing + or value, quant_scale + sll window, 10 + addiu bit_offset, -10 + + b .Lfeed_bitstream + sh value, 0(output) .Lprocess_dc_v3_coefficient: # if (!coeff_index && is_v3) - # TODO: version 3 is currently not supported. - jr $ra - li $v0, -1 - -.Lprocess_ac_coefficient: # if (coeff_index) - # Check whether the prefix code is one of the shorter, more common ones, - # and start counting the number of leading zeroes/ones using the GTE (which - # takes 2 more cycles). - srl $v0, $t0, 30 - li $v1, 3 - beq $v0, $v1, .Lac_prefix_11 - li $v1, 2 - beq $v0, $v1, .Lac_prefix_10 - li $v1, 1 - mtc2 $t0, $30 - beq $v0, $v1, .Lac_prefix_01 + # Version 3 DC coefficients are variable-length deltas, prefixed with a + # Huffman code indicating their length. Since the prefix code is up to 7 + # bits long, it makes sense to decode it with a simple 128-byte lookup + # table rather than using the GTE. The codes are different for luma and + # chroma blocks, so each table entry contains the decoded length for both + # block types (packed as two nibbles). Prefix 111111111 is used to signal + # the end of the bitstream. 
+ # prefix = window >> (32 - 9) + # if (prefix == 0x1ff) break + # lengths = huffman_table->dc[prefix >> 2] + srl length, window, 23 + beq length, temp, .Lstop_processing + srl length, 2 + addu length, huffman_table + + addiu $at, block_index, -4 + bltz $at, .Ldc_block_y + lbu length, VLC_Table_dc(length) + beqz $at, .Ldc_block_cb + andi length, 15 # if (block_index >= Cb) dc_length = lengths & 15 + +.Ldc_block_cr: # if (block_index > Cb) + # prefix_length = huffman_table->dc_len[dc_length] & 15 + addu temp, length, huffman_table + lbu temp, VLC_Table_dc_len(temp) + li $at, 32 + andi temp, 15 + + sllv window, window, temp + beqz length, .Ldc_cr_zero # if (dc_length) + subu bit_offset, temp + + subu $at, length # value = window >> (32 - dc_length) + srlv value, window, $at + + # Decode the sign bit, then add the decoded delta to the current value. + # if (!(window >> 31)) value -= (1 << dc_length) - 1 + bltz window, .Ldc_cr_positive + li temp, -1 + srlv temp, temp, $at + subu value, temp +.Ldc_cr_positive: + addu last_cr, value + andi last_cr, 0x3ff + +.Ldc_cr_zero: + sll temp, last_cr, 2 # *output = (last_cr << 2) | quant_scale + or temp, quant_scale + b .Lupdate_window_dc # update_window(dc_length) + sh temp, 0(output) + +.Ldc_block_cb: # if (block_index == Cb) + # prefix_length = huffman_table->dc_len[dc_length] & 15 + addu temp, length, huffman_table + lbu temp, VLC_Table_dc_len(temp) + li $at, 32 + andi temp, 15 + + sllv window, window, temp + beqz length, .Ldc_cb_zero # if (dc_length) + subu bit_offset, temp + + subu $at, length # value = window >> (32 - dc_length) + srlv value, window, $at + + # Decode the sign bit, then add the decoded delta to the current value. 
+ # if (!(window >> 31)) value -= (1 << dc_length) - 1 + bltz window, .Ldc_cb_positive + li temp, -1 + srlv temp, temp, $at + subu value, temp +.Ldc_cb_positive: + addu last_cb, value + andi last_cb, 0x3ff + +.Ldc_cb_zero: + sll value, last_cb, 2 # *output = (last_cb << 2) | quant_scale + or value, quant_scale + b .Lupdate_window_dc # update_window(dc_length) + sh value, 0(output) + +.Ldc_block_y: # if (block_index < Cb) nop + srl length, 4 # dc_length = lengths >> 4 + + # prefix_length = huffman_table->dc_len[dc_length] >> 4 + addu temp, length, huffman_table + lbu temp, VLC_Table_dc_len(temp) + li $at, 32 + srl temp, 4 + + sllv window, window, temp + beqz length, .Ldc_y_zero # if (dc_length) + subu bit_offset, temp + + sll temp, last_y, 2 + subu $at, length # value = window >> (32 - dc_length) + srlv value, window, $at + + # Decode the sign bit, then add the decoded delta to the current value. + # if (!(window >> 31)) value -= (1 << dc_length) - 1 + bltz window, .Ldc_y_positive + li temp, -1 + srlv temp, temp, $at + subu value, temp +.Ldc_y_positive: + addu last_y, value + andi last_y, 0x3ff + +.Ldc_y_zero: + sll temp, last_y, 2 # *output = (last_y << 2) | quant_scale + or temp, quant_scale + b .Lupdate_window_dc # update_window(dc_length) + sh temp, 0(output) - # If the code is longer, retrieve the number of leading zeroes from the GTE - # and use it as an index into the jump area. Each block in the area is 8 - # instructions long and handles decoding a specific prefix. - mfc2 $v0, $31 - li $v1, 11 - bgt $v0, $v1, .Lreturn_error # if (prefix > 11) return -1 - sll $v0, 5 # jump_addr = &ac_jump_area[prefix * 8 * sizeof(u32)] - addu $v0, $t9 - jr $v0 +.Lprocess_ac_coefficient: # if (coeff_index) + # Check whether the prefix code is 10 or 11 (i.e. if it starts with 1). If + # not, retrieve the number of leading zeroes from the GTE and use it as an + # index into the jump area. Each block in the area is 8 instructions long + # and handles decoding a specific prefix. 
+ mfc2 temp, C2_LZCR + + bltz window, .Lac_prefix_1 # if (!(window >> 31)) + addiu $at, temp, -11 # if (prefix > 11) return -1 + bgtz $at, .Lreturn_error + sll temp, 5 # jump_addr = &ac_jump_area[prefix * 8 * sizeof(uint32_t)] + addu temp, ac_jump_area + jr temp nop .Lreturn_error: - jr $ra + b .Lreturn li $v0, -1 -.Lac_prefix_11: - # Prefix 11 is followed by a single bit. - srl $v0, $t0, 28 # index = ((window >> (32 - 2 - 1)) & 1) * sizeof(u16) - andi $v0, 2 - addu $v0, $t8 # value = table->lut0[index] - lhu $v0, DECDCTTAB_lut0($v0) - sll $t0, 3 # window <<= 3 - b .Lwrite_value - addiu $t5, -3 # bit_offset -= 3 - #.word 0 +.Lac_prefix_1: # if (window >> 31) + sll window, 1 + bltz window, .Lac_prefix_11 + li temp, 0xfe00 .Lac_prefix_10: # Prefix 10 marks the end of a block. - li $v0, 0xfe00 # value = 0xfe00 - sll $t0, 2 # window <<= 2 - addiu $t5, -2 # bit_offset -= 2 - addiu $t6, -1 # block_index-- - bgez $t6, .Lwrite_value - li $t7, 0 # coeff_index = 0 - b .Lwrite_value - li $t6, 5 # if (block_index < 0) block_index = 5 + # *output = 0xfe00 + # coeff_index = 0 + # if (--block_index < Y3) block_index = Cr + sll window, 1 + addiu bit_offset, -2 + sh temp, 0(output) + + addiu block_index, -1 + bgez block_index, .Lfeed_bitstream + li coeff_index, 0 + b .Lfeed_bitstream + li block_index, 5 + +.Lac_prefix_11: + # Prefix 11 is followed by a single bit. Note that the 10/11 prefix check + # already shifts the window by one bit (without updating the bit offset). + # index = ((window >> (32 - 1 - 1)) & 1) * sizeof(uint16_t) + # *output = huffman_table->ac0[index] + srl value, window, 29 + andi value, 2 + addu value, huffman_table + lhu value, VLC_Table_ac0(value) + sll window, 2 + addiu bit_offset, -3 + + b .Lfeed_bitstream + sh value, 0(output) .Lac_prefix_01: # Prefix 01 can be followed by a 2-bit lookup index starting with 1, or a # 3-bit lookup index starting with 0. A 32-bit lookup table is used, # containing both MDEC codes and lengths. 
- srl $v0, $t0, 25 # index = ((window >> (32 - 2 - 3)) & 7) * sizeof(u32) - andi $v0, 28 - addu $v0, $t8 # value = table->lut2[index] - lw $v0, DECDCTTAB_lut2($v0) - b .Lupdate_window_and_write - srl $v1, $v0, 16 # length = value >> 16 + # index = ((window >> (32 - 2 - 3)) & 7) * sizeof(uint32_t) + # *output = huffman_table->ac2[index] & 0xffff + # length = huffman_table->ac2[index] >> 16 + srl value, window, 25 + andi value, 28 + addu value, huffman_table + lw value, VLC_Table_ac2(value) + + b .Lupdate_window_ac # update_window(value >> 16) + sh value, 0(output) .word 0, 0 .Lac_prefix_001: # Prefix 001 can be followed by a 6-bit lookup index starting with 00, or a # 3-bit lookup index starting with 01/10/11. - srl $v0, $t0, 21 # index = ((window >> (32 - 3 - 6)) & 63) * sizeof(u32) - andi $v0, 252 - addu $v0, $t8 # value = table->lut3[index] - lw $v0, DECDCTTAB_lut3($v0) - b .Lupdate_window_and_write - srl $v1, $v0, 16 # length = value >> 16 + # index = ((window >> (32 - 3 - 6)) & 63) * sizeof(uint32_t) + # *output = huffman_table->ac3[index] & 0xffff + # length = huffman_table->ac3[index] >> 16 + srl value, window, 21 + andi value, 252 + addu value, huffman_table + lw value, VLC_Table_ac3(value) + + b .Lupdate_window_ac # update_window(value >> 16) + sh value, 0(output) .word 0, 0 .Lac_prefix_0001: # Prefix 0001 is followed by a 3-bit lookup index. - srl $v0, $t0, 24 # index = ((window >> (32 - 4 - 3)) & 7) * sizeof(u16) - andi $v0, 14 - addu $v0, $t8 # value = table->lut4[index] - lhu $v0, DECDCTTAB_lut4($v0) - sll $t0, 7 # window <<= 4 + 3 - b .Lwrite_value - addiu $t5, -7 # bit_offset -= 4 + 3 - .word 0 + # index = ((window >> (32 - 4 - 3)) & 7) * sizeof(uint16_t) + # *output = huffman_table->ac4[index] + srl value, window, 24 + andi value, 14 + addu value, huffman_table + lhu value, VLC_Table_ac4(value) + sll window, 7 + addiu bit_offset, -7 + + b .Lfeed_bitstream + sh value, 0(output) .Lac_prefix_00001: # Prefix 00001 is followed by a 3-bit lookup index. 
- srl $v0, $t0, 23 # index = ((window >> (32 - 5 - 3)) & 7) * sizeof(u16) - andi $v0, 14 - addu $v0, $t8 # value = table->lut5[index] - lhu $v0, DECDCTTAB_lut5($v0) - sll $t0, 8 # window <<= 5 + 3 - b .Lwrite_value - addiu $t5, -8 # bit_offset -= 5 + 3 - .word 0 + # index = ((window >> (32 - 5 - 3)) & 7) * sizeof(uint16_t) + # *output = huffman_table->ac5[index] + srl value, window, 23 + andi value, 14 + addu value, huffman_table + lhu value, VLC_Table_ac5(value) + sll window, 8 + addiu bit_offset, -8 + + b .Lfeed_bitstream + sh value, 0(output) .Lac_prefix_000001: # Prefix 000001 is an escape code followed by a full 16-bit MDEC value. - srl $v0, $t0, 10 # value = window >> (32 - 6 - 16) - sll $t0, 22 # window <<= 6 + 16 - b .Lwrite_value - addiu $t5, -22 # bit_offset -= 6 + 16 - .word 0, 0, 0, 0 + # *output = window >> (32 - 6 - 16) + srl value, window, 10 + sll window, 22 + addiu bit_offset, -22 + + b .Lfeed_bitstream + sh value, 0(output) + .word 0, 0, 0 .Lac_prefix_0000001: # Prefix 0000001 is followed by a 4-bit lookup index. - srl $v0, $t0, 20 # index = ((window >> (32 - 7 - 4)) & 15) * sizeof(u16) - andi $v0, 30 - addu $v0, $t8 # value = table->lut7[index] - lhu $v0, DECDCTTAB_lut7($v0) - sll $t0, 11 # window <<= 7 + 4 - b .Lwrite_value - addiu $t5, -11 # bit_offset -= 7 + 4 - .word 0 + # index = ((window >> (32 - 7 - 4)) & 15) * sizeof(uint16_t) + # *output = huffman_table->ac7[index] + srl value, window, 20 + andi value, 30 + addu value, huffman_table + lhu value, VLC_Table_ac7(value) + sll window, 11 + addiu bit_offset, -11 + + b .Lfeed_bitstream + sh value, 0(output) .Lac_prefix_00000001: # Prefix 00000001 is followed by a 5-bit lookup index. 
- srl $v0, $t0, 18 # index = ((window >> (32 - 8 - 5)) & 31) * sizeof(u16) - andi $v0, 62 - addu $v0, $t8 # value = table->lut8[index] - lhu $v0, DECDCTTAB_lut8($v0) - sll $t0, 13 # window <<= 8 + 5 - b .Lwrite_value - addiu $t5, -13 # bit_offset -= 8 + 5 - .word 0 + # index = ((window >> (32 - 8 - 5)) & 31) * sizeof(uint16_t) + # *output = huffman_table->ac8[index] + srl value, window, 18 + andi value, 62 + addu value, huffman_table + lhu value, VLC_Table_ac8(value) + sll window, 13 + addiu bit_offset, -13 + + b .Lfeed_bitstream + sh value, 0(output) .Lac_prefix_000000001: # Prefix 000000001 is followed by a 5-bit lookup index. - srl $v0, $t0, 17 # index = ((window >> (32 - 9 - 5)) & 31) * sizeof(u16) - andi $v0, 62 - addu $v0, $t8 # value = table->lut9[index] - lhu $v0, DECDCTTAB_lut9($v0) - sll $t0, 14 # window <<= 9 + 5 - b .Lwrite_value - addiu $t5, -14 # bit_offset -= 9 + 5 - .word 0 + # index = ((window >> (32 - 9 - 5)) & 31) * sizeof(uint16_t) + # *output = huffman_table->ac9[index] + srl value, window, 17 + andi value, 62 + addu value, huffman_table + lhu value, VLC_Table_ac9(value) + sll window, 14 + addiu bit_offset, -14 + + b .Lfeed_bitstream + sh value, 0(output) .Lac_prefix_0000000001: # Prefix 0000000001 is followed by a 5-bit lookup index. - srl $v0, $t0, 16 # index = ((window >> (32 - 10 - 5)) & 31) * sizeof(u16) - andi $v0, 62 - addu $v0, $t8 # value = table->lut10[index] - lhu $v0, DECDCTTAB_lut10($v0) - sll $t0, 15 # window <<= 10 + 5 - b .Lwrite_value - addiu $t5, -15 # bit_offset -= 10 + 5 - .word 0 + # index = ((window >> (32 - 10 - 5)) & 31) * sizeof(uint16_t) + # *output = huffman_table->ac10[index] + srl value, window, 16 + andi value, 62 + addu value, huffman_table + lhu value, VLC_Table_ac10(value) + sll window, 15 + addiu bit_offset, -15 + + b .Lfeed_bitstream + sh value, 0(output) .Lac_prefix_00000000001: # Prefix 00000000001 is followed by a 5-bit lookup index. 
- srl $v0, $t0, 15 # index = ((window >> (32 - 11 - 5)) & 31) * sizeof(u16) - andi $v0, 62 - addu $v0, $t8 # value = table->lut11[index] - lhu $v0, DECDCTTAB_lut11($v0) - sll $t0, 16 # window <<= 11 + 5 - b .Lwrite_value - addiu $t5, -16 # bit_offset -= 11 + 5 - .word 0 + # index = ((window >> (32 - 11 - 5)) & 31) * sizeof(uint16_t) + # *output = huffman_table->ac11[index] + srl value, window, 15 + andi value, 62 + addu value, huffman_table + lhu value, VLC_Table_ac11(value) + sll window, 16 + addiu bit_offset, -16 + + b .Lfeed_bitstream + sh value, 0(output) .Lac_prefix_000000000001: # Prefix 000000000001 is followed by a 5-bit lookup index. - srl $v0, $t0, 14 # index = ((window >> (32 - 12 - 5)) & 31) * sizeof(u16) - andi $v0, 62 - addu $v0, $t8 # value = table->lut12[index] - lhu $v0, DECDCTTAB_lut12($v0) - sll $t0, 17 # window <<= 12 + 5 - b .Lwrite_value - addiu $t5, -17 # bit_offset -= 12 + 5 - .word 0 - -.Lupdate_window_and_write: - sllv $t0, $t0, $v1 # window <<= length - subu $t5, $v1 # bit_offset -= length -.Lwrite_value: - sh $v0, 0($a1) + # index = ((window >> (32 - 12 - 5)) & 31) * sizeof(uint16_t) + # *output = huffman_table->ac12[index] + srl value, window, 14 + andi value, 62 + addu value, huffman_table + lhu value, VLC_Table_ac12(value) + sll window, 17 + addiu bit_offset, -17 + + b .Lfeed_bitstream + sh value, 0(output) + +.Lupdate_window_ac: + srl length, value, 16 +.Lupdate_window_dc: + sllv window, window, length + subu bit_offset, length + .Lfeed_bitstream: # Update the window. This makes sure the next iteration of the loop will be # able to read up to 32 bits from the bitstream. 
- bgez $t5, .Lskip_feeding # if (bit_offset < 0) - addiu $a2, -1 # max_size-- - - subu $v0, $0, $t5 # window = next_window << (-bit_offset) - sllv $t0, $t1, $v0 - lw $t1, 0($a3) # next_window = (*input << 16) | (*input >> 16) - addiu $t5, 32 # bit_offset += 32 - srl $v0, $t1, 16 - sll $t1, 16 - or $t1, $v0 - addiu $a3, 4 # input++ + bgez bit_offset, .Lskip_feeding # if (bit_offset < 0) + addiu max_size, -1 + + subu temp, $0, bit_offset # window = next_window << (-bit_offset) + sllv window, next_window, temp + lw next_window, 0(input) # next_window = (*input << 16) | (*input >> 16) + addiu bit_offset, 32 + srl temp, next_window, 16 + sll next_window, 16 + or next_window, temp + addiu input, 4 .Lskip_feeding: - srlv $v0, $t1, $t5 # window |= next_window >> bit_offset - or $t0, $v0 + srlv temp, next_window, bit_offset # window |= next_window >> bit_offset + or window, temp - bnez $a2, .Lprocess_next_code_loop - addiu $a1, 2 # output++ + bnez max_size, .Lprocess_next_code_loop + addiu output, 2 .Lstop_processing: # If remaining = 0, skip flushing the context, pad the output buffer with # end-of-block codes if necessary and return 0. Otherwise flush the context # and return 1. 
- beqz $t2, .Lpad_output_buffer - nop - - sw $a3, VLC_Context_input($a0) - sw $t0, VLC_Context_window($a0) - sw $t1, VLC_Context_next_window($a0) - sw $t2, VLC_Context_remaining($a0) - sh $t3, VLC_Context_quant_scale($a0) - sb $t4, VLC_Context_is_v3($a0) - sb $t5, VLC_Context_bit_offset($a0) - sb $t6, VLC_Context_block_index($a0) - sb $t7, VLC_Context_coeff_index($a0) - - jr $ra + beqz remaining, .Lpad_output_buffer + li temp, 0xfe00 + + sw input, VLC_Context_input(ctx) + sw window, VLC_Context_window(ctx) + sw next_window, VLC_Context_next_window(ctx) + sw remaining, VLC_Context_remaining(ctx) + sb bit_offset, VLC_Context_bit_offset(ctx) + sb block_index, VLC_Context_block_index(ctx) + sb coeff_index, VLC_Context_coeff_index(ctx) + sh last_y, VLC_Context_last_y(ctx) + sh last_cr, VLC_Context_last_cr(ctx) + sh last_cb, VLC_Context_last_cb(ctx) + + b .Lreturn li $v0, 1 .Lpad_output_buffer: - beqz $a2, .Lreturn_zero - li $v0, 0xfe00 -.Lpad_output_buffer_loop: # while (max_size) - sh $v0, 0($a1) # *output = 0xfe00 - addiu $a2, -1 # max_size-- - bnez $a2, .Lpad_output_buffer_loop - addiu $a1, 2 # output++ + beqz max_size, .Lreturn + li $v0, 0 -.Lreturn_zero: +.Lpad_output_buffer_loop: # while (max_size) + sh temp, 0(output) + addiu max_size, -1 + bnez max_size, .Lpad_output_buffer_loop + addiu output, 2 + +.Lreturn: + lw $s0, 0($sp) + lw $s1, 4($sp) + lw $s2, 8($sp) + lw $s3, 12($sp) jr $ra - li $v0, 0 + addiu $sp, 16 diff --git a/libpsn00b/psxpress/vlc2.c b/libpsn00b/psxpress/vlc2.c index 9eb99bf..24c54ce 100644 --- a/libpsn00b/psxpress/vlc2.c +++ b/libpsn00b/psxpress/vlc2.c @@ -63,7 +63,7 @@ static const uint32_t _compressed_table[TABLE_LENGTH] = { static VLC_Context _default_context; static size_t _max_buffer_size = 0; -const DECDCTTAB2 *_vlc_huffman_table2 = 0; +const DECDCTTAB *_vlc_huffman_table2 = 0; /* VLC decoder */ @@ -77,14 +77,17 @@ int __attribute__((optimize(3))) DecDCTvlcContinue2( VLC_Context *ctx, uint32_t *buf, size_t max_size ) { const uint32_t 
*input = ctx->input; - uint32_t remaining = ctx->remaining; uint32_t window = ctx->window; uint32_t next_window = ctx->next_window; - uint16_t quant_scale = ctx->quant_scale; + uint32_t remaining = ctx->remaining; + int is_v3 = ctx->is_v3; + int bit_offset = ctx->bit_offset; int block_index = ctx->block_index; int coeff_index = ctx->coeff_index; - int bit_offset = ctx->bit_offset; - int is_v3 = ctx->is_v3; + uint16_t quant_scale = ctx->quant_scale; + int16_t last_y = ctx->last_y; + int16_t last_cr = ctx->last_cr; + int16_t last_cb = ctx->last_cb; //if (!_vlc_huffman_table2) //return -1; @@ -122,13 +125,13 @@ int __attribute__((optimize(3))) DecDCTvlcContinue2( } else if (window >> 24) { // The first lookup table is for codes that do not start with // 00000000. - value = _vlc_huffman_table2->lut[_get_bits_unsigned(13)]; + value = _vlc_huffman_table2->ac[_get_bits_unsigned(13)]; _advance_window(value >> 16); *output = (uint16_t) value; } else { // If the code starts with 00000000, use the second lookup // table. 
- value = _vlc_huffman_table2->lut00[_get_bits_unsigned(17)]; + value = _vlc_huffman_table2->ac00[_get_bits_unsigned(17)]; _advance_window(value >> 16); *output = (uint16_t) value; } @@ -176,12 +179,15 @@ int __attribute__((optimize(3))) DecDCTvlcContinue2( return 0; ctx->input = input; - ctx->remaining = remaining; ctx->window = window; ctx->next_window = next_window; + ctx->remaining = remaining; + ctx->bit_offset = bit_offset; ctx->block_index = block_index; ctx->coeff_index = coeff_index; - ctx->bit_offset = bit_offset; + ctx->last_y = last_y; + ctx->last_cr = last_cr; + ctx->last_cb = last_cb; return 1; } @@ -197,21 +203,24 @@ int DecDCTvlcStart2( return -1; ctx->input = &input[2]; - ctx->remaining = (header->mdec0_header & 0xffff) * 2; ctx->window = (input[0] << 16) | (input[0] >> 16); ctx->next_window = (input[1] << 16) | (input[1] >> 16); - ctx->quant_scale = (header->quant_scale & 63) << 10; + ctx->remaining = (header->mdec0_header & 0xffff) * 2; + ctx->is_v3 = (header->version >= 3); + ctx->bit_offset = 32; ctx->block_index = 0; ctx->coeff_index = 0; - ctx->bit_offset = 32; - ctx->is_v3 = (header->version == 3); + ctx->quant_scale = (header->quant_scale & 63) << 10; + ctx->last_y = 0; + ctx->last_cr = 0; + ctx->last_cb = 0; return DecDCTvlcContinue2(ctx, buf, max_size); } /* Stateful VLC decoder API (for Sony SDK compatibility) */ -int DecDCTvlc2(const uint32_t *bs, uint32_t *buf, DECDCTTAB2 *table) { +int DecDCTvlc2(const uint32_t *bs, uint32_t *buf, DECDCTTAB *table) { if (table) _vlc_huffman_table2 = table; @@ -230,7 +239,7 @@ size_t DecDCTvlcSize2(size_t size) { /* Lookup table decompressor */ -void DecDCTvlcBuild(DECDCTTAB2 *table) { +void DecDCTvlcBuild(DECDCTTAB *table) { uint32_t *output = (uint32_t *) table; _vlc_huffman_table2 = table; diff --git a/libpsn00b/psxspu/common.c b/libpsn00b/psxspu/common.c index 6ccbef4..1275621 100644 --- a/libpsn00b/psxspu/common.c +++ b/libpsn00b/psxspu/common.c @@ -1,6 +1,6 @@ /* * PSn00bSDK SPU library (common 
functions) - * (C) 2022 spicyjpeg - MPL licensed + * (C) 2022-2023 spicyjpeg - MPL licensed */ #include @@ -32,7 +32,7 @@ static void _wait_status(uint16_t mask, uint16_t value) { return; } - _sdk_log("status register timeout (0x%04x)\n", SPU_STAT); + _sdk_log("timeout, status=0x%04x\n", SPU_STAT); } static size_t _dma_transfer(uint32_t *data, size_t length, int write) { -- cgit v1.2.3 From 8e3a757d4d7d5dfc62f69ce4ede08f1cf79e3461 Mon Sep 17 00:00:00 2001 From: spicyjpeg Date: Tue, 4 Apr 2023 00:46:20 +0200 Subject: Add IsIdleGPU(), tweak psxgpu.h, fix mdec/strvideo --- examples/demos/n00bdemo/main.c | 29 ++++++++++++------------- examples/mdec/strvideo/main.c | 19 +++++++++++------ libpsn00b/include/psxgpu.h | 48 +++++++++++++++++++++++++++--------------- libpsn00b/include/psxpress.h | 5 ++--- libpsn00b/psxgpu/common.c | 15 +++++++++++++ libpsn00b/psxpress/vlc2.c | 11 ++++------ 6 files changed, 78 insertions(+), 49 deletions(-) (limited to 'examples') diff --git a/examples/demos/n00bdemo/main.c b/examples/demos/n00bdemo/main.c index c9ca3ce..55dbbc4 100644 --- a/examples/demos/n00bdemo/main.c +++ b/examples/demos/n00bdemo/main.c @@ -358,10 +358,9 @@ void stencilstuff() { /* The stencil demo is achieved by utilizing the mask bit setting primitive GP0(E6h). The structure of this primitive is defined as - DR_MASK initialized and set by setDrawMask(). These are not available - in Sony's SDK by default. + DR_STP initialized and set by setDrawStp(). - The DR_MASK primitive controls mask bit operations for drawing + The DR_STP primitive controls mask bit operations for drawing primitives such as setting mask bits on every pixel drawn or mask bit test where pixels won't be drawn on pixels with the mask bit set. It applies to most graphics drawing primitives except VRAM fill. @@ -373,10 +372,10 @@ void stencilstuff() { bit operation disabled. 
The stencil effect featured in this demo is achieved by enabling set - mask bit with DR_MASK, drawing semi-transparent primitives using + mask bit with DR_STP, drawing semi-transparent primitives using additive blending but color is all zero to make it completely invisible but is enough to update the mask bits, disable mask set bit but enable - mask test with DR_MASK and then drawing a rectangle that fills the + mask test with DR_STP and then drawing a rectangle that fills the entire screen. Semi-transparency mask in textures must not be used when drawing the scene that will be 'below' the mask layer. */ @@ -384,7 +383,7 @@ void stencilstuff() { int spin=0; - DR_MASK *mask; + DR_STP *mask; TILE *rect; SC_OT s_ot; @@ -430,10 +429,10 @@ void stencilstuff() { // Sort mask primitive that enables setting mask bits - mask = (DR_MASK*)nextpri; - setDrawMask( mask, 1, 0 ); + mask = (DR_STP*)nextpri; + setDrawStp( mask, 1, 0 ); addPrim( ot[db]+20, mask ); - nextpri += sizeof(DR_MASK); + nextpri += sizeof(DR_STP); // Sort the stars @@ -465,10 +464,10 @@ void stencilstuff() { // Sort mask primitive that enables mask bit test - mask = (DR_MASK*)nextpri; - setDrawMask( mask, 0, 1 ); + mask = (DR_STP*)nextpri; + setDrawStp( mask, 0, 1 ); addPrim( ot[db]+18, mask ); - nextpri += sizeof(DR_MASK); + nextpri += sizeof(DR_STP); // Sort rectangle that fills the screen @@ -482,10 +481,10 @@ void stencilstuff() { // Clear all mask settings - mask = (DR_MASK*)nextpri; - setDrawMask( mask, 0, 0 ); + mask = (DR_STP*)nextpri; + setDrawStp( mask, 0, 0 ); addPrim( ot[db]+15, mask ); - nextpri += sizeof(DR_MASK); + nextpri += sizeof(DR_STP); // Sort overlay then display diff --git a/examples/mdec/strvideo/main.c b/examples/mdec/strvideo/main.c index 57cb6ef..853e0c2 100644 --- a/examples/mdec/strvideo/main.c +++ b/examples/mdec/strvideo/main.c @@ -189,8 +189,15 @@ void cd_sector_handler(void) { return; // If this sector is actually part of a new frame, validate the sectors - // that have been 
read so far and flip the bitstream data buffers. - if (sector_header.frame_id != str_ctx.frame_id) { + // that have been read so far and flip the bitstream data buffers. If the + // frame number is actually lower than the current one, assume the drive + // has started reading another .STR file and stop playback. + if ((int) sector_header.frame_id < str_ctx.frame_id) { + str_ctx.frame_ready = -1; + return; + } + + if ((int) sector_header.frame_id > str_ctx.frame_id) { // Do not set the ready flag if any sector has been missed. if (str_ctx.sector_count) str_ctx.dropped_frames++; @@ -263,11 +270,9 @@ void init_stream(void) { CdReadyCallback(&cd_event_handler); ExitCriticalSection(); - // Set the maximum amount of data DecDCTvlc() can output and copy the - // lookup table used for decompression to the scratchpad area. This is - // optional but makes the decompressor slightly faster. See the libpsxpress - // documentation for more details. - DecDCTvlcSize(0x8000); + // Copy the lookup table used for frame decompression to the scratchpad + // area. This is optional but makes the decompressor slightly faster. See + // the libpsxpress documentation for more details. 
DecDCTvlcCopyTableV3((VLC_TableV3 *) 0x1f800000); str_ctx.cur_frame = 0; diff --git a/libpsn00b/include/psxgpu.h b/libpsn00b/include/psxgpu.h index 2329908..78d8342 100644 --- a/libpsn00b/include/psxgpu.h +++ b/libpsn00b/include/psxgpu.h @@ -83,7 +83,7 @@ typedef enum _GPU_VideoMode { (p)->u0 = (_u0), (p)->v0 = (_v0), \ (p)->u1 = (_u1), (p)->v1 = (_v1), \ (p)->u2 = (_u2), (p)->v2 = (_v2) - + #define setUV4(p, _u0, _v0, _u1, _v1, _u2, _v2, _u3, _v3) \ (p)->u0 = (_u0), (p)->v0 = (_v0), \ (p)->u1 = (_u1), (p)->v1 = (_v1), \ @@ -202,38 +202,48 @@ typedef enum _GPU_VideoMode { #define setTexWindow_T(p, r) \ (p)->code[0] = (0xe2000000 | \ - ((r)->w % 32) | \ - (((r)->h % 32) << 5) | \ - (((r)->x % 32) << 10) | \ - (((r)->y % 32) << 15) \ + ((r)->w & 0x1f) | \ + (((r)->h & 0x1f) << 5) | \ + (((r)->x & 0x1f) << 10) | \ + (((r)->y & 0x1f) << 15) \ ) #define setTexWindow(p, r) \ setlen(p, 1), setTexWindow_T(p, r) -#define setDrawArea_T(p, r) \ +#define setDrawAreaXY_T(p, _x0, _y0, _x1, _y1) \ (p)->code[0] = (0xe3000000 | \ - ((r)->x % 1024) | \ - (((r)->y % 1024) << 10) \ + ((_x0) & 0x3ff) | \ + (((_y0) & 0x3ff) << 10) \ ), \ (p)->code[1] = (0xe4000000 | \ - (((r)->x + (r)->w - 1) % 1024) | \ - ((((r)->y + (r)->h - 1) % 1024) << 10) \ + ((_x1) & 0x3ff) | \ + (((_y1) & 0x3ff) << 10) \ + ) +#define setDrawAreaXY(p, _x0, _y0, _x1, _y1) \ + setlen(p, 2), setDrawAreaXY_T(p, _x0, _y0, _x1, _y1) + +#define setDrawArea_T(p, r) \ + setDrawAreaXY_T(p, \ + (r)->x, \ + (r)->y, \ + (r)->x + (r)->w - 1, \ + (r)->y + (r)->h - 1 \ ) #define setDrawArea(p, r) \ setlen(p, 2), setDrawArea_T(p, r) #define setDrawOffset_T(p, _x, _y) \ (p)->code[0] = (0xe5000000 | \ - ((_x) % 1024) | \ - (((_y) % 1024) << 11) \ + ((_x) & 0x7ff) | \ + (((_y) & 0x7ff) << 11) \ ) #define setDrawOffset(p, _x, _y) \ setlen(p, 1), setDrawOffset_T(p, _x, _y) -#define setDrawMask_T(p, sb, mt) \ - (p)->code[0] = (0xe6000000 | (sb) | ((mt) << 1)) -#define setDrawMask(p, sb, mt) \ - setlen(p, 1), setDrawMask_T(p, sb, mt) 
+#define setDrawStp_T(p, pbw, mt) \ + (p)->code[0] = (0xe6000000 | (pbw) | ((mt) << 1)) +#define setDrawStp(p, pbw, mt) \ + setlen(p, 1), setDrawStp_T(p, pbw, mt) /* Primitive structure definitions */ @@ -469,7 +479,7 @@ _DEF_PRIM(DR_TWIN, _DEF_PRIM(DR_TPAGE, uint32_t code[1]; ) -_DEF_PRIM(DR_MASK, +_DEF_PRIM(DR_STP, uint32_t code[1]; ) @@ -481,6 +491,9 @@ _DEF_PRIM(DR_ENV, FILL_T fill; ) +#undef _DEF_PRIM +#undef _DEF_ALIAS + /* Structure definitions */ typedef struct _RECT { @@ -545,6 +558,7 @@ void PutDrawEnv(DRAWENV *env); void PutDrawEnvFast(DRAWENV *env); int GetODE(void); +int IsIdleGPU(int timeout); int VSync(int mode); void *VSyncHaltFunction(void (*func)(void)); void *VSyncCallback(void (*func)(void)); diff --git a/libpsn00b/include/psxpress.h b/libpsn00b/include/psxpress.h index c3b13f4..ea0c2ec 100644 --- a/libpsn00b/include/psxpress.h +++ b/libpsn00b/include/psxpress.h @@ -246,8 +246,6 @@ int DecDCToutSync(int mode); * can be different). If max_size = 0, the entire frame will always be decoded * in one shot. * - * Only bitstream version 2 is currently supported. - * * WARNING: InitGeom() must be called prior to using DecDCTvlcStart() for the * first time. Attempting to call this function with the GTE disabled will * result in a crash. @@ -411,7 +409,8 @@ void DecDCTvlcCopyTableV3(VLC_TableV3 *addr); * buffer can be different). If max_size = 0, the entire frame will always be * decoded in one shot. * - * Only bitstream version 2 is currently supported. + * This function only supports decoding version 1 and 2 bitstreams. Use + * DecDCTvlcStart() to decode a version 3 bitstream. 
* * @param ctx Pointer to VLC_Context structure (which will be initialized) * @param buf diff --git a/libpsn00b/psxgpu/common.c b/libpsn00b/psxgpu/common.c index 537f672..e354261 100644 --- a/libpsn00b/psxgpu/common.c +++ b/libpsn00b/psxgpu/common.c @@ -321,3 +321,18 @@ void DrawOTag2(const uint32_t *ot) { DMA_BCR(DMA_GPU) = 0; DMA_CHCR(DMA_GPU) = 0x01000401; } + +/* Queue pause/resume API */ + +int IsIdleGPU(int timeout) { + if (timeout <= 0) + timeout = 1; + + for (; timeout; timeout--) { + if (GPU_GP1 & (1 << 26)) + return 0; + } + + //_sdk_log("IsIdleGPU() timeout\n"); + return -1; +} diff --git a/libpsn00b/psxpress/vlc2.c b/libpsn00b/psxpress/vlc2.c index 24c54ce..7d9d9f3 100644 --- a/libpsn00b/psxpress/vlc2.c +++ b/libpsn00b/psxpress/vlc2.c @@ -123,7 +123,7 @@ int __attribute__((optimize(3))) DecDCTvlcContinue2( *output = (uint16_t) _get_bits_unsigned(22); _advance_window(22); } else if (window >> 24) { - // The first lookup table is for codes that not start with + // The first lookup table is for codes that do not start with // 00000000. value = _vlc_huffman_table2->ac[_get_bits_unsigned(13)]; _advance_window(value >> 16); @@ -136,12 +136,9 @@ int __attribute__((optimize(3))) DecDCTvlcContinue2( *output = (uint16_t) value; } } else { - // Parse the DC (first) coefficient for this block. Version 2 - // simply stores the signed 10-bit value as-is, while version 3 - // uses a delta encoding combined with a compression method similar - // to exp-Golomb. + // Parse the DC (first) coefficient for this block. if (is_v3) { - // TODO: version 3 is currently not supported. + // This implementation does not support version 3. return -1; } else { value = _get_bits_unsigned(10); @@ -161,7 +158,7 @@ int __attribute__((optimize(3))) DecDCTvlcContinue2( // time and processes each 16-bit word starting from the MSB, so an // endianness conversion is necessary to preserve bit order when // reading 32 bits at a time. 
Also note that the PS1 CPU is not capable - // of shifting by more than 31 bits - it will shift by 0 bits instead! + // of shifting by >=32 bits - it will shift by (N % 32) bits instead! if (bit_offset < 0) { window = next_window << (-bit_offset); bit_offset += 32; -- cgit v1.2.3 From fd846206ae9419af5ed227989b3ad49b541a823c Mon Sep 17 00:00:00 2001 From: spicyjpeg Date: Tue, 4 Apr 2023 01:09:50 +0200 Subject: Add missing CD image dependencies to CMake scripts --- examples/cdrom/cdbrowse/CMakeLists.txt | 5 ++++- examples/cdrom/cdbrowse/iso.xml | 5 +---- examples/cdrom/cdxa/CMakeLists.txt | 6 ++++-- examples/cdrom/cdxa/iso.xml | 5 +---- examples/io/system573/iso.xml | 5 +---- examples/mdec/strvideo/CMakeLists.txt | 5 ++++- examples/mdec/strvideo/iso.xml | 5 +---- examples/sound/cdstream/CMakeLists.txt | 5 ++++- examples/sound/cdstream/iso.xml | 5 +---- examples/system/dynlink/CMakeLists.txt | 2 +- examples/system/dynlink/iso.xml | 5 +---- libpsn00b/cmake/internal_setup.cmake | 15 +++++++-------- template/CMakeLists.txt | 2 +- template/iso.xml | 5 +---- 14 files changed, 32 insertions(+), 43 deletions(-) (limited to 'examples') diff --git a/examples/cdrom/cdbrowse/CMakeLists.txt b/examples/cdrom/cdbrowse/CMakeLists.txt index 0cc091f..70a4585 100644 --- a/examples/cdrom/cdbrowse/CMakeLists.txt +++ b/examples/cdrom/cdbrowse/CMakeLists.txt @@ -13,7 +13,10 @@ project( file(GLOB _sources *.c) psn00bsdk_add_executable(cdbrowse GPREL ${_sources}) -psn00bsdk_add_cd_image(cdbrowse_iso cdbrowse iso.xml DEPENDS cdbrowse) +psn00bsdk_add_cd_image( + cdbrowse_iso cdbrowse iso.xml + DEPENDS cdbrowse system.cnf +) psn00bsdk_target_incbin(cdbrowse PRIVATE ball16c ball16c.tim) diff --git a/examples/cdrom/cdbrowse/iso.xml b/examples/cdrom/cdbrowse/iso.xml index 771b0e9..f1c00f7 100644 --- a/examples/cdrom/cdbrowse/iso.xml +++ b/examples/cdrom/cdbrowse/iso.xml @@ -1,8 +1,5 @@ - + - + - + - + - + - + - +