diff options
| author | spicyjpeg <thatspicyjpeg@gmail.com> | 2022-07-17 21:33:35 +0200 |
|---|---|---|
| committer | spicyjpeg <thatspicyjpeg@gmail.com> | 2022-07-17 21:33:35 +0200 |
| commit | 5b63607ba4ca12c2a2935ea9618b3ffe6a6d3ab3 (patch) | |
| tree | f7ec51b65eb148d4fbddca5127589592e57d219c | |
| parent | c800972bc13ad0c7015b7d44fe9f124b719e792e (diff) | |
| download | psn00bsdk-5b63607ba4ca12c2a2935ea9618b3ffe6a6d3ab3.tar.gz | |
Add experimental psxpress Huffman decoding API
| -rw-r--r-- | libpsn00b/include/psxpress.h | 250 | ||||
| -rw-r--r-- | libpsn00b/psxpress/generate_lookup_table.py | 297 | ||||
| -rw-r--r-- | libpsn00b/psxpress/vlc.c | 130 | ||||
| -rw-r--r-- | libpsn00b/psxpress/vlc.s | 404 | ||||
| -rw-r--r-- | libpsn00b/psxpress/vlc2.c | 240 |
5 files changed, 1313 insertions, 8 deletions
diff --git a/libpsn00b/include/psxpress.h b/libpsn00b/include/psxpress.h index 6203c2a..2106a53 100644 --- a/libpsn00b/include/psxpress.h +++ b/libpsn00b/include/psxpress.h @@ -17,6 +17,28 @@ typedef struct _DECDCTENV { int16_t dct[64]; // Inverse DCT matrix (2.14 fixed-point) } DECDCTENV; +// This is the "small" lookup table used by DecDCTvlc(). It can be copied to +// the scratchpad. +typedef struct _DECDCTTAB { + uint16_t lut0[2]; + uint32_t lut2[8]; + uint32_t lut3[64]; + uint16_t lut4[8]; + uint16_t lut5[8]; + uint16_t lut7[16]; + uint16_t lut8[32]; + uint16_t lut9[32]; + uint16_t lut10[32]; + uint16_t lut11[32]; + uint16_t lut12[32]; +} DECDCTTAB; + +// This is the "large" table used by DecDCTvlc2(). +typedef struct _DECDCTTAB2 { + uint32_t lut[8192]; + uint32_t lut00[512]; +} DECDCTTAB2; + typedef enum _DECDCTMODE { DECDCT_MODE_24BPP = 1, DECDCT_MODE_16BPP = 0, @@ -24,6 +46,23 @@ typedef enum _DECDCTMODE { DECDCT_MODE_RAW = -1 } DECDCTMODE; +typedef struct _VLC_Context { + const uint32_t *input; + uint32_t window, next_window, remaining; + uint16_t quant_scale; + int8_t is_v3, bit_offset, block_index, coeff_index; +} VLC_Context; + +// Despite what some docs claim, the "number of 32-byte blocks" and "always +// 0x3800" fields are actually a single 32-bit field which is copied over to +// the output buffer, then parsed by DecDCTin() and written to the MDEC0 +// register. +typedef struct { + uint32_t mdec0_header; + uint16_t quant_scale; + uint16_t version; +} BS_Header; + /* Public API */ #ifdef __cplusplus @@ -64,12 +103,12 @@ void DecDCTReset(int mode); void DecDCTPutEnv(const DECDCTENV *env, int mono); /** - * @brief Sets up the MDEC to start fetching and decoding a stream from the - * given address in main RAM. The first 32-bit word is initially copied to the - * MDEC0 register, then all subsequent data is read in 128-byte (32-word) - * chunks. 
The length of the stream (in 32-bit units, minus the first word) - * must be encoded in the lower 16 bits of the first word, as expected by the - * MDEC. + * @brief Sets up the MDEC to start fetching and decoding the given buffer. + * This function is meant to be used with buffers generated by DecDCTvlc(): the + * first 32-bit word of the buffer is initially copied to the MDEC0 register, + * then all subsequent data is read in 128-byte (32-word) chunks. The length of + * the stream (in 32-bit units, minus the first word) is encoded by DecDCTvlc() + * in the lower 16 bits of the first word. * * The mode argument optionally specifies the output color depth (0 for 16bpp, * 1 for 24bpp) if not already set in the first word. Passing -1 will result in @@ -111,7 +150,7 @@ void DecDCTinRaw(const uint32_t *data, size_t length); * stream (usually a whole frame) is being written to main RAM. * * @param mode - * @return 0 or -1 in case of a timeout (mode = 0) / MDEC busy flag (mode = 1) + * @return 0 or -1 in case of a timeout (mode = 0), MDEC busy flag (mode = 1) */ int DecDCTinSync(int mode); @@ -142,10 +181,205 @@ void DecDCTout(uint32_t *data, size_t length); * to register a callback that calls DecDCTin() to feed the MDEC. * * @param mode - * @return 0 or -1 in case of a timeout (mode = 0) / DMA busy flag (mode = 1) + * @return 0 or -1 in case of a timeout (mode = 0), DMA busy flag (mode = 1) */ int DecDCToutSync(int mode); +/** + * @brief Begins decompressing the contents of a .BS file (or of a single .STR + * frame) into a buffer that can be passed to DecDCTin(). This function uses a + * small (<1 KB) lookup table combined with the GTE to accelerate the process; + * performance is roughly on par with DecDCTvlcStart2() if the lookup table + * is copied to the scratchpad beforehand by calling DecDCTvlcCopyTable(). The + * contents of the GTE's LZCR register, if any, will be destroyed. 
+ * + * A VLC_Context object must be created and passed to this function, which will + * then proceed to initialize its fields. The max_size argument sets the + * maximum number of words that will be written to the output buffer; if more + * data needs to be written, this function will return 1. To continue decoding + * call DecDCTvlcContinue() with the same VLC_Context object (the output buffer + * can be different). If max_size = 0, the entire frame will always be decoded + * in one shot. + * + * Only bitstream version 2 is currently supported. + * + * WARNING: InitGeom() must be called prior to using DecDCTvlcStart() for the + * first time. Attempting to call this function with the GTE disabled will + * result in a crash. + * + * @param ctx Pointer to VLC_Context structure (which will be initialized) + * @param buf + * @param max_size Maximum number of 32-bit words to output + * @param bs + * @return 0, 1 if more data needs to be output or -1 in case of failure + */ +int DecDCTvlcStart(VLC_Context *ctx, uint32_t *buf, size_t max_size, const uint32_t *bs); + +/** + * @brief Resumes the decompression process started by DecDCTvlcStart(). The + * state of the decompressor is contained entirely in the VLC_Context structure + * so an arbitrary number of bitstreams can be decoded concurrently (although + * the limited CPU power makes it impractical to do so) by keeping a separate + * context for each bitstream. + * + * This function behaves like DecDCTvlcStart(), returning 1 if more data has to + * be written or 0 otherwise. DecDCTvlcContinue() shall not be called after a + * previous call to DecDCTvlcStart() or DecDCTvlcContinue() with the same + * context returned 0; in that case the context shall be discarded or reused to + * decode another bitstream. + * + * The contents of the GTE's LZCR register, if any, will be destroyed. + * + * See DecDCTvlcStart() for more details. 
+ * + * @param ctx Pointer to already initialized VLC_Context structure + * @param buf + * @param max_size Maximum number of 32-bit words to output + * @return 0, 1 if more data needs to be output or -1 in case of failure + */ +int DecDCTvlcContinue(VLC_Context *ctx, uint32_t *buf, size_t max_size); + +/** + * A wrapper around DecDCTvlcStart() and DecDCTvlcContinue() for compatibility + * with the official SDK. This function uses an internal context; additionally, + * the maximum output buffer size is not passed as an argument but is instead + * set by calling DecDCTvlcSize(). + * + * This function behaves identically to DecDCTvlcContinue() if bs = 0 and + * DecDCTvlcStart() otherwise. + * + * See DecDCTvlcStart() for more details. + * + * WARNING: InitGeom() must be called prior to using DecDCTvlc() for the first + * time. Attempting to call this function with the GTE disabled will result in + * a crash. + * + * @param bs Pointer to bitstream data or 0 to resume decoding + * @param buf + * @return 0, 1 if more data needs to be output or -1 in case of failure + */ +int DecDCTvlc(const uint32_t *bs, uint32_t *buf); + +/** + * @brief Sets the maximum number of 32-bit words that a single call to + * DecDCTvlc() will output. If size = 0, the entire frame will always be + * decoded in one shot. + * + * @param size Maximum number of 32-bit words to output + * @return Previously set value + */ +size_t DecDCTvlcSize(size_t size); + +/** + * @brief Copies the small (<1 KB) lookup table used by DecDCTvlcContinue(), + * DecDCTvlcStart() and DecDCTvlc() (a DECDCTTAB structure) to the specified + * address. A copy of this table is always present in main RAM, however this + * function can be used to copy it to the scratchpad region to boost + * decompression performance. + * + * The address passed to this function is saved. Calls to DecDCTvlcStart(), + * DecDCTvlcContinue() and DecDCTvlc() will automatically use the last table + * copied. 
Call DecDCTvlcCopyTable(0) to revert to using the library's internal + * table in main RAM. + * + * @param addr Pointer to free area in scratchpad region or 0 to reset + */ +void DecDCTvlcCopyTable(DECDCTTAB *addr); + +/** + * @brief Begins decompressing the contents of a .BS file (or of a single .STR + * frame) into a buffer that can be passed to DecDCTin(). This function uses a + * large (34 KB) lookup table that must be loaded into main RAM beforehand by + * calling DecDCTvlcBuild(), but does not use the GTE nor the scratchpad. + * Depending on the specific bitstream being decoded DecDCTvlcStart2() might be + * slightly faster or slower than DecDCTvlcStart() with its lookup table copied + * to the scratchpad (see DecDCTvlcCopyTable()). DecDCTvlcStart() with the + * table in main RAM tends to be much slower. + * + * A VLC_Context object must be created and passed to this function, which will + * then proceed to initialize its fields. The max_size argument sets the + * maximum number of words that will be written to the output buffer; if more + * data needs to be written, this function will return 1. To continue decoding + * call DecDCTvlcContinue2() with the same VLC_Context object (the output + * buffer can be different). If max_size = 0, the entire frame will always be + * decoded in one shot. + * + * Only bitstream version 2 is currently supported. + * + * @param ctx Pointer to VLC_Context structure (which will be initialized) + * @param buf + * @param max_size Maximum number of 32-bit words to output + * @param bs + * @return 0, 1 if more data needs to be output or -1 in case of failure + */ +int DecDCTvlcStart2(VLC_Context *ctx, uint32_t *buf, size_t max_size, const uint32_t *bs); + +/** + * @brief Resumes the decompression process started by DecDCTvlcStart2(). 
The + * state of the decompressor is contained entirely in the VLC_Context structure + * so an arbitrary number of bitstreams can be decoded concurrently (although + * the limited CPU power makes it impractical to do so) by keeping a separate + * context for each bitstream. + * + * This function behaves like DecDCTvlcStart2(), returning 1 if more data has + * to be written or 0 otherwise. DecDCTvlcContinue2() shall not be called after + * a previous call to DecDCTvlcStart2() or DecDCTvlcContinue2() with the same + * context returned 0; in that case the context shall be discarded or reused to + * decode another bitstream. + * + * See DecDCTvlcStart2() for more details. + * + * @param ctx Pointer to already initialized VLC_Context structure + * @param buf + * @param max_size Maximum number of 32-bit words to output + * @return 0, 1 if more data needs to be output or -1 in case of failure + */ +int DecDCTvlcContinue2(VLC_Context *ctx, uint32_t *buf, size_t max_size); + +/** + * A wrapper around DecDCTvlcStart2() and DecDCTvlcContinue2() for + * compatibility with the official SDK. This function uses an internal context; + * additionally, the maximum output buffer size is not passed as an argument + * but is instead set by calling DecDCTvlcSize2(). + * + * This function behaves identically to DecDCTvlcContinue2() if bs = 0 and + * DecDCTvlcStart2() otherwise. The table argument can optionally be passed to + * use a custom lookup table. If zero, the last pointer passed to + * DecDCTvlcBuild() will be used. + * + * See DecDCTvlcStart2() for more details. + * + * @param bs Pointer to bitstream data or 0 to resume decoding + * @param buf + * @param table Pointer to decompressed table or 0 to use last table used + * @return 0, 1 if more data needs to be output or -1 in case of failure + */ +int DecDCTvlc2(const uint32_t *bs, uint32_t *buf, DECDCTTAB2 *table); + +/** + * @brief Sets the maximum number of 32-bit words that a single call to + * DecDCTvlc2() will output.
If size = 0, the entire frame will always be + * decoded in one shot. + * + * @param size Maximum number of 32-bit words to output + * @return Previously set value + */ +size_t DecDCTvlcSize2(size_t size); + +/** + * @brief Generates the lookup table required by DecDCTvlcStart2(), + * DecDCTvlcContinue2() and DecDCTvlc2() (a DECDCTTAB2 structure) into the + * specified buffer. Since the table is relatively large (34 KB), it is + * recommended to only generate it in a dynamically-allocated buffer when + * needed and deallocate the buffer afterwards. + * + * The address passed to this function is saved. Calls to DecDCTvlcStart2() and + * DecDCTvlcContinue2() will automatically use the last table decompressed. + * + * @param table + */ +void DecDCTvlcBuild(DECDCTTAB2 *table); + #ifdef __cplusplus } #endif diff --git a/libpsn00b/psxpress/generate_lookup_table.py b/libpsn00b/psxpress/generate_lookup_table.py new file mode 100644 index 0000000..b40771f --- /dev/null +++ b/libpsn00b/psxpress/generate_lookup_table.py @@ -0,0 +1,297 @@ +#!/usr/bin/env python3 +# Huffman lookup table generator script for psxpress +# (C) 2022 spicyjpeg - MPL licensed + +import sys, json +from array import array +from itertools import repeat +from argparse import ArgumentParser, FileType + +HUFFMAN_TREE = { + "10": 0xfe00, # End of block + "11": ( 0, 1 ), + "01": { + "1": ( 1, 1 ), + "00": ( 0, 2 ), + "01": ( 2, 1 ) + }, + "001": { + "01": ( 0, 3 ), + "10": ( 4, 1 ), + "11": ( 3, 1 ), + "00000": ( 13, 1 ), + "00001": ( 0, 6 ), + "00010": ( 12, 1 ), + "00011": ( 11, 1 ), + "00100": ( 3, 2 ), + "00101": ( 1, 3 ), + "00110": ( 0, 5 ), + "00111": ( 10, 1 ) + }, + "0001": { + "00": ( 7, 1 ), + "01": ( 6, 1 ), + "10": ( 1, 2 ), + "11": ( 5, 1 ) + }, + "00001": { + "00": ( 2, 2 ), + "01": ( 9, 1 ), + "10": ( 0, 4 ), + "11": ( 8, 1 ) + }, + "0000001": { + "000": ( 16, 1 ), + "001": ( 5, 2 ), + "010": ( 0, 7 ), + "011": ( 2, 3 ), + "100": ( 1, 4 ), + "101": ( 15, 1 ), + "110": ( 14, 1 ), + "111": ( 4, 
2 ) + }, + "00000001": { + "0000": ( 0, 11 ), + "0001": ( 8, 2 ), + "0010": ( 4, 3 ), + "0011": ( 0, 10 ), + "0100": ( 2, 4 ), + "0101": ( 7, 2 ), + "0110": ( 21, 1 ), + "0111": ( 20, 1 ), + "1000": ( 0, 9 ), + "1001": ( 19, 1 ), + "1010": ( 18, 1 ), + "1011": ( 1, 5 ), + "1100": ( 3, 3 ), + "1101": ( 0, 8 ), + "1110": ( 6, 2 ), + "1111": ( 17, 1 ) + }, + "000000001": { + "0000": ( 10, 2 ), + "0001": ( 9, 2 ), + "0010": ( 5, 3 ), + "0011": ( 3, 4 ), + "0100": ( 2, 5 ), + "0101": ( 1, 7 ), + "0110": ( 1, 6 ), + "0111": ( 0, 15 ), + "1000": ( 0, 14 ), + "1001": ( 0, 13 ), + "1010": ( 0, 12 ), + "1011": ( 26, 1 ), + "1100": ( 25, 1 ), + "1101": ( 24, 1 ), + "1110": ( 23, 1 ), + "1111": ( 22, 1 ) + }, + "0000000001": { + "0000": ( 0, 31 ), + "0001": ( 0, 30 ), + "0010": ( 0, 29 ), + "0011": ( 0, 28 ), + "0100": ( 0, 27 ), + "0101": ( 0, 26 ), + "0110": ( 0, 25 ), + "0111": ( 0, 24 ), + "1000": ( 0, 23 ), + "1001": ( 0, 22 ), + "1010": ( 0, 21 ), + "1011": ( 0, 20 ), + "1100": ( 0, 19 ), + "1101": ( 0, 18 ), + "1110": ( 0, 17 ), + "1111": ( 0, 16 ) + }, + "00000000001": { + "0000": ( 0, 40 ), + "0001": ( 0, 39 ), + "0010": ( 0, 38 ), + "0011": ( 0, 37 ), + "0100": ( 0, 36 ), + "0101": ( 0, 35 ), + "0110": ( 0, 34 ), + "0111": ( 0, 33 ), + "1000": ( 0, 32 ), + "1001": ( 1, 14 ), + "1010": ( 1, 13 ), + "1011": ( 1, 12 ), + "1100": ( 1, 11 ), + "1101": ( 1, 10 ), + "1110": ( 1, 9 ), + "1111": ( 1, 8 ) + }, + "000000000001": { + "0000": ( 1, 18 ), + "0001": ( 1, 17 ), + "0010": ( 1, 16 ), + "0011": ( 1, 15 ), + "0100": ( 6, 3 ), + "0101": ( 16, 2 ), + "0110": ( 15, 2 ), + "0111": ( 14, 2 ), + "1000": ( 13, 2 ), + "1001": ( 12, 2 ), + "1010": ( 11, 2 ), + "1011": ( 31, 1 ), + "1100": ( 30, 1 ), + "1101": ( 29, 1 ), + "1110": ( 28, 1 ), + "1111": ( 27, 1 ) + } +} + +## Utilities + +def to_int10(value): + clamped = min(max(int(value), -0x200), 0x1ff) + return clamped + (0 if clamped >= 0 else 0x400) + +def uint32_to_lines(data, indent = "\t", columns = 6): + for offset in
range(0, len(data), columns): + line = f"{indent}0x{data[offset]:08x}" + + for item in data[(offset + 1):(offset + columns)]: + line += f", 0x{item:08x}" + + yield line + +## Table generation + +def iterate_tree(tree): + for code, value in tree.items(): + if type(value) is dict: + # Iterate through any subtree recursively. + for suffix, _value in iterate_tree(value): + yield f"{code}{suffix}", _value + + elif isinstance(value, (tuple, list)): + # Trees loaded from JSON represent (run, level) pairs as lists. + run_length, ac = value + yield f"{code}0", (run_length << 10) | to_int10(ac) + yield f"{code}1", (run_length << 10) | to_int10(-ac) + + else: + yield code, value + +def generate_table(codes, table_bits, prefix_bits = 0): + table = array("I", repeat(0, 2 ** table_bits)) + + for code, value in codes: + used_bits = len(code) + free_bits = table_bits - (used_bits - prefix_bits) + index = int(code[prefix_bits:], 2) << free_bits + + # Fill out every entry in the table whose index starts with the same + # string of bits. + for combo in range(2 ** free_bits): + table[index | combo] = (used_bits << 16) | value + + return table + +def compress_table(table): + values = [] + last_value = table[0] + run_length = 0 + + for value in table[1:]: + if value == last_value and run_length < 0x7ff: + run_length += 1 + continue + + # The run length is stored in the top 11 bits of each value, which are + # otherwise unused. + values.append((run_length << 21) | last_value) + last_value = value + run_length = 0 + + values.append((run_length << 21) | last_value) + return array("I", values) + +## Main + +UNCOMPRESSED_TEMPLATE = """static const DECDCTTAB2 {name} = {{ + .lut = {{ +{short} + }}, + .lut00 = {{ +{long} + }} +}}; +""" +COMPRESSED_TEMPLATE = """static const uint32_t {name}[{length}] = {{ +{table} +}}; +""" + +def get_args(): + parser = ArgumentParser( + description = "Generates a Huffman lookup table structure, to be used by DecDCTvlc2()."
+ ) + parser.add_argument( + "-c", "--compress", + action = "store_true", + help = "generate run-length compressed data instead of a DECDCTTAB2 struct" + ) + parser.add_argument( + "-n", "--name", + type = str, + default = "_default_huffman_table", + help = "set the symbol name in the generated C source", + metavar = "name" + ) + parser.add_argument( + "-t", "--tree", + type = FileType("rt"), + help = "use a custom Huffman tree from the specified JSON file", + metavar = "json_file" + ) + parser.add_argument( + "-o", "--output", + type = FileType("wt"), + default = sys.stdout, + help = "where to output generated table (stdout by default)", + metavar = "file" + ) + + return parser.parse_args() + +def main(): + args = get_args() + tree = json.load(args.tree) if args.tree else HUFFMAN_TREE + + short_codes, short_bits = [], 0 + long_codes, long_bits = [], 0 + + for pair in iterate_tree(tree): + if (code := pair[0]).startswith("00000000"): + long_codes.append(pair) + long_bits = max(long_bits, len(code) - 8) + else: + short_codes.append(pair) + short_bits = max(short_bits, len(code)) + + short_table = generate_table(short_codes, short_bits, 0) + long_table = generate_table(long_codes, long_bits, 8) + + if args.compress: + short_table.extend(long_table) + table = compress_table(short_table) + + source = COMPRESSED_TEMPLATE.format( + name = args.name, + length = len(table), + table = ",\n".join(uint32_to_lines(table, "\t")) + ) + else: + source = UNCOMPRESSED_TEMPLATE.format( + name = args.name, + short = ",\n".join(uint32_to_lines(short_table, "\t\t")), + long = ",\n".join(uint32_to_lines(long_table, "\t\t")) + ) + + with args.output as _file: + _file.write(source) + +if __name__ == "__main__": + main() diff --git a/libpsn00b/psxpress/vlc.c b/libpsn00b/psxpress/vlc.c new file mode 100644 index 0000000..4e3e283 --- /dev/null +++ b/libpsn00b/psxpress/vlc.c @@ -0,0 +1,130 @@ +/* + * PSn00bSDK MDEC library (support code for the main VLC decompressor) + * (C) 2022 spicyjpeg -
MPL licensed + */ + +#include <stdint.h> +#include <stddef.h> +#include <string.h> +#include <psxpress.h> + +/* Huffman code lookup table */ + +#define _val1(rl, dc) (((rl) << 10) | ((uint16_t) (dc) & 0x3ff)) +#define _val2(rl, dc, len) (_val1(rl, dc) | (len << 16)) + +#define _pair(rl, dc) _val1(rl, dc), _val1(rl, -(dc)) +#define _pair2(rl, dc, len) _val2(rl, dc, len), _val2(rl, -(dc), len) +#define _pair3(rl, dc, len) \ + _val2(rl, dc, len), _val2(rl, dc, len), \ + _val2(rl, -(dc), len), _val2(rl, -(dc), len) +#define _pair4(rl, dc, len) \ + _val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), \ + _val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), \ + _val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len), \ + _val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len) + +// This table isn't compressed since it makes no sense to compress less than a +// kilobyte's worth of data. 
+static const DECDCTTAB _default_huffman_table = { + .lut0 = { + // 11 x + _pair( 0, 1) + }, + .lut2 = { + // 01 0xx + _pair2( 0, 2, 5), _pair2( 2, 1, 5), + // 01 1x- + _pair3( 1, 1, 4) + }, + .lut3 = { + // 001 00xxxx + _pair2(13, 1, 9), _pair2( 0, 6, 9), _pair2(12, 1, 9), _pair2(11, 1, 9), + _pair2( 3, 2, 9), _pair2( 1, 3, 9), _pair2( 0, 5, 9), _pair2(10, 1, 9), + // 001 xxx--- + _pair4( 0, 3, 6), _pair4( 4, 1, 6), _pair4( 3, 1, 6) + }, + .lut4 = { + // 0001 xxx + _pair( 7, 1), _pair( 6, 1), _pair( 1, 2), _pair( 5, 1) + }, + .lut5 = { + // 00001 xxx + _pair( 2, 2), _pair( 9, 1), _pair( 0, 4), _pair( 8, 1) + }, + .lut7 = { + // 0000001 xxxx + _pair(16, 1), _pair( 5, 2), _pair( 0, 7), _pair( 2, 3), + _pair( 1, 4), _pair(15, 1), _pair(14, 1), _pair( 4, 2) + }, + .lut8 = { + // 00000001 xxxxx + _pair( 0, 11), _pair( 8, 2), _pair( 4, 3), _pair( 0, 10), + _pair( 2, 4), _pair( 7, 2), _pair(21, 1), _pair(20, 1), + _pair( 0, 9), _pair(19, 1), _pair(18, 1), _pair( 1, 5), + _pair( 3, 3), _pair( 0, 8), _pair( 6, 2), _pair(17, 1) + }, + .lut9 = { + // 000000001 xxxxx + _pair(10, 2), _pair( 9, 2), _pair( 5, 3), _pair( 3, 4), + _pair( 2, 5), _pair( 1, 7), _pair( 1, 6), _pair( 0, 15), + _pair( 0, 14), _pair( 0, 13), _pair( 0, 12), _pair(26, 1), + _pair(25, 1), _pair(24, 1), _pair(23, 1), _pair(22, 1) + }, + .lut10 = { + // 0000000001 xxxxx + _pair( 0, 31), _pair( 0, 30), _pair( 0, 29), _pair( 0, 28), + _pair( 0, 27), _pair( 0, 26), _pair( 0, 25), _pair( 0, 24), + _pair( 0, 23), _pair( 0, 22), _pair( 0, 21), _pair( 0, 20), + _pair( 0, 19), _pair( 0, 18), _pair( 0, 17), _pair( 0, 16) + }, + .lut11 = { + // 00000000001 xxxxx + _pair( 0, 40), _pair( 0, 39), _pair( 0, 38), _pair( 0, 37), + _pair( 0, 36), _pair( 0, 35), _pair( 0, 34), _pair( 0, 33), + _pair( 0, 32), _pair( 1, 14), _pair( 1, 13), _pair( 1, 12), + _pair( 1, 11), _pair( 1, 10), _pair( 1, 9), _pair( 1, 8) + }, + .lut12 = { + // 000000000001 xxxxx + _pair( 1, 18), _pair( 1, 17), _pair( 1, 16), _pair( 1, 15), + _pair( 6, 
3), _pair(16, 2), _pair(15, 2), _pair(14, 2), + _pair(13, 2), _pair(12, 2), _pair(11, 2), _pair(31, 1), + _pair(30, 1), _pair(29, 1), _pair(28, 1), _pair(27, 1) + } +}; + +/* Internal globals */ + +// Note that DecDCTvlc() and DecDCTvlc2() do *not* share the same variables. +static VLC_Context _default_context; +static size_t _max_buffer_size = 0; + +const DECDCTTAB *_vlc_huffman_table = &_default_huffman_table; + +/* Stateful VLC decoder API (for Sony SDK compatibility) */ + +int DecDCTvlc(const uint32_t *bs, uint32_t *buf) { + if (bs) + return DecDCTvlcStart(&_default_context, buf, _max_buffer_size, bs); + else + return DecDCTvlcContinue(&_default_context, buf, _max_buffer_size); +} + +size_t DecDCTvlcSize(size_t size) { + size_t old_size = _max_buffer_size; + _max_buffer_size = size; + + return old_size; +} + +/* Lookup table relocation API */ + +void DecDCTvlcCopyTable(DECDCTTAB *addr) { + if (addr) { + _vlc_huffman_table = addr; + memcpy(addr, &_default_huffman_table, sizeof(DECDCTTAB)); + } else { + _vlc_huffman_table = &_default_huffman_table; + } +} diff --git a/libpsn00b/psxpress/vlc.s b/libpsn00b/psxpress/vlc.s new file mode 100644 index 0000000..fe51642 --- /dev/null +++ b/libpsn00b/psxpress/vlc.s @@ -0,0 +1,404 @@ +# PSn00bSDK MDEC library (GTE-accelerated VLC decompressor) +# (C) 2022 spicyjpeg - MPL licensed +# +# Register map: +# - $a0 = ctx +# - $a1 = output +# - $a2 = max_size +# - $a3 = input +# - $t0 = window +# - $t1 = next_window +# - $t2 = remaining +# - $t3 = quant_scale +# - $t4 = is_v3 +# - $t5 = bit_offset +# - $t6 = block_index +# - $t7 = coeff_index +# - $t8 = _vlc_huffman_table +# - $t9 = &ac_jump_area + +.set noreorder + +.set VLC_Context_input, 0 +.set VLC_Context_window, 4 +.set VLC_Context_next_window, 8 +.set VLC_Context_remaining, 12 +.set VLC_Context_quant_scale, 16 +.set VLC_Context_is_v3, 18 +.set VLC_Context_bit_offset, 19 +.set VLC_Context_block_index, 20 +.set VLC_Context_coeff_index, 21 + +.set DECDCTSMALLTAB_lut0, 0 +.set 
DECDCTSMALLTAB_lut2, 4 +.set DECDCTSMALLTAB_lut3, 36 +.set DECDCTSMALLTAB_lut4, 292 +.set DECDCTSMALLTAB_lut5, 308 +.set DECDCTSMALLTAB_lut7, 324 +.set DECDCTSMALLTAB_lut8, 356 +.set DECDCTSMALLTAB_lut9, 420 +.set DECDCTSMALLTAB_lut10, 484 +.set DECDCTSMALLTAB_lut11, 548 +.set DECDCTSMALLTAB_lut12, 612 + +.section .text.DecDCTvlcStart +.global DecDCTvlcStart +.type DecDCTvlcStart, @function +DecDCTvlcStart: + # Create a new context on-the-fly without writing it to memory then jump + # into DecDCTvlcContinue(), skipping context loading. + lw $t0, 8($a3) # window = (bs->data[0] << 16) | (bs->data[0] >> 16) + nop + srl $v0, $t0, 16 + sll $t0, 16 + + lw $t1, 12($a3) # next_window = (bs->data[1] << 16) | (bs->data[1] >> 16) + or $t0, $v0 + srl $v0, $t1, 16 + sll $t1, 16 + + lhu $t2, 0($a3) # remaining = bs->uncomp_length * 2 + or $t1, $v0 + + lhu $t3, 4($a3) # quant_scale = (bs->quant_scale & 63) << 10 + sll $t2, 1 + andi $t3, 63 + + lhu $t4, 6($a3) # is_v3 = !(bs->version < 3) + sll $t3, 10 + sltiu $t4, $t4, 3 + xori $t4, 1 + + li $t5, 32 # bit_offset = 32 + li $t6, 5 # block_index = 5 + li $t7, 0 # coeff_index = 0 + j _vlc_skip_context_load + addiu $a3, 16 # input = &(bs->data[2]) + +.section .text.DecDCTvlcContinue +.global DecDCTvlcContinue +.type DecDCTvlcContinue, @function +DecDCTvlcContinue: + lw $a3, VLC_Context_input($a0) + lw $t0, VLC_Context_window($a0) + lw $t1, VLC_Context_next_window($a0) + lw $t2, VLC_Context_remaining($a0) + lhu $t3, VLC_Context_quant_scale($a0) + lb $t4, VLC_Context_is_v3($a0) + lb $t5, VLC_Context_bit_offset($a0) + lb $t6, VLC_Context_block_index($a0) + lb $t7, VLC_Context_coeff_index($a0) + +_vlc_skip_context_load: + # Determine how many bytes to output. 
This whole block of code basically + # does this: + # max_size = min((max_size - 1) * 2, remaining) + # remaining -= max_size + bgtz $a2, .Lmax_size_valid # if (max_size <= 0) max_size = 0x7ffe0000 + addiu $a2, -1 # else max_size = (max_size - 1) * 2 + lui $a2, 0x3fff +.Lmax_size_valid: + sll $a2, 1 + + blt $a2, $t2, .Lmax_size_ok # if (max_size > remaining) max_size = remaining + lui $v1, 0x3800 + move $a2, $t2 +.Lmax_size_ok: + subu $t2, $a2 # remaining -= max_size + + # Write the length of the data that will be decoded to first 4 bytes of the + # output buffer, which will be then parsed by DecDCTin(). + srl $v0, $a2, 1 # output[0] = 0x38000000 | (max_size / 2) + or $v0, $v1 + sw $v0, 0($a1) + + # Obtain the addresses of the lookup table and jump area in advance so that + # they don't have to be retrieved for each coefficient decoded. + lw $t8, _vlc_huffman_table + la $t9, .Lac_jump_area + + beqz $a2, .Lstop_processing + addiu $a1, 4 # output = (uint16_t *) &output[1] + +.Lprocess_next_code_loop: # while (max_size) + # This is the "hot" part of the decoder, executed for each code in the + # bitstream. The first step is to determine if the next code is a DC or AC + # coefficient. The GTE is also given the task of counting the number of + # leading zeroes/ones, which takes 2 more cycles. + bnez $t7, .Lprocess_ac_coefficient + mtc2 $t0, $30 + bnez $t4, .Lprocess_dc_v3_coefficient + #nop + +.Lprocess_dc_v2_coefficient: # if (!coeff_index && !is_v3) + # The DC coefficient in version 2 frames is not compressed. + srl $v0, $t0, 22 # *output = (window >> (32 - 10)) | quant_scale + or $v0, $t3 + addiu $t7, 1 # coeff_index++ + sll $t0, 10 # window <<= 10 + addiu $t5, -10 # bit_offset -= 10 + b .Lwrite_value + addiu $t7, 1 # coeff_index++ + +.Lprocess_dc_v3_coefficient: # if (!coeff_index && is_v3) + # TODO: version 3 is currently not supported. 
+ jr $ra + li $v0, -1 + #b .Lwrite_value + +.Lprocess_ac_coefficient: # if (coeff_index) + # Check whether the prefix code is one of the shorter, more common ones. + srl $v0, $t0, 30 + li $v1, 3 + beq $v0, $v1, .Lac_prefix_11 + li $v1, 2 + beq $v0, $v1, .Lac_prefix_10 + li $v1, 1 + beq $v0, $v1, .Lac_prefix_01 + #srl $v0, $t0, 29 + #beq $v0, $v1, .Lac_prefix_001 + #nop + + # If the code is longer, retrieve the number of leading zeroes from the GTE + # and use it as an index into the jump area. Each block in the area is 8 + # instructions long and handles decoding a specific prefix. + mfc2 $v0, $31 + nop + andi $v0, 15 # jump_addr = &ac_jump_area[(prefix % 16) * 8 * sizeof(u32)] + sll $v0, 5 + addu $v0, $t9 + jr $v0 + nop + +.Lac_prefix_11: + # Prefix 11 is followed by a single bit. + srl $v0, $t0, 28 # index = ((window >> (32 - 2 - 1)) & 1) * sizeof(u16) + andi $v0, 2 + addu $v0, $t8 # value = table->lut0[index] + lhu $v0, DECDCTSMALLTAB_lut0($v0) + sll $t0, 3 # window <<= 3 + addiu $t5, -3 # bit_offset -= 3 + b .Lwrite_value + addiu $t7, 1 # coeff_index++ + +.Lac_jump_area: +.Lac_prefix_10: + # Prefix 10 marks the end of a block. + li $v0, 0xfe00 # value = 0xfe00 + sll $t0, 2 # window <<= 2 + addiu $t5, -2 # bit_offset -= 2 + addiu $t6, -1 # block_index-- + bgez $t6, .Lwrite_value + li $t7, 0 # coeff_index = 0 + b .Lwrite_value + li $t6, 5 # if (block_index < 0) block_index = 5 + +.Lac_prefix_01: + # Prefix 01 can be followed by a 2-bit lookup index starting with 1, or a + # 3-bit lookup index starting with 0. A 32-bit lookup table is used, + # containing both MDEC codes and lengths. 
+ srl $v0, $t0, 25 # index = ((window >> (32 - 2 - 3)) & 7) * sizeof(u32) + andi $v0, 28 + addu $v0, $t8 # value = table->lut2[index] + lw $v0, DECDCTSMALLTAB_lut2($v0) + addiu $t7, 1 # coeff_index++ + b .Lupdate_window_and_write + srl $v1, $v0, 16 # length = value >> 16 + .word 0 + +.Lac_prefix_001: + # Prefix 001 can be followed by a 6-bit lookup index starting with 00, or a + # 3-bit lookup index starting with 01/10/11. + srl $v0, $t0, 21 # index = ((window >> (32 - 3 - 6)) & 63) * sizeof(u32) + andi $v0, 252 + addu $v0, $t8 # value = table->lut3[index] + lw $v0, DECDCTSMALLTAB_lut3($v0) + addiu $t7, 1 # coeff_index++ + b .Lupdate_window_and_write + srl $v1, $v0, 16 # length = value >> 16 + .word 0 + +.Lac_prefix_0001: + # Prefix 0001 is followed by a 3-bit lookup index. + srl $v0, $t0, 24 # index = ((window >> (32 - 4 - 3)) & 7) * sizeof(u16) + andi $v0, 14 + addu $v0, $t8 # value = table->lut4[index] + lhu $v0, DECDCTSMALLTAB_lut4($v0) + sll $t0, 7 # window <<= 4 + 3 + addiu $t5, -7 # bit_offset -= 4 + 3 + b .Lwrite_value + addiu $t7, 1 # coeff_index++ + +.Lac_prefix_00001: + # Prefix 00001 is followed by a 3-bit lookup index. + srl $v0, $t0, 23 # index = ((window >> (32 - 5 - 3)) & 7) * sizeof(u16) + andi $v0, 14 + addu $v0, $t8 # value = table->lut5[index] + lhu $v0, DECDCTSMALLTAB_lut5($v0) + sll $t0, 8 # window <<= 5 + 3 + addiu $t5, -8 # bit_offset -= 5 + 3 + b .Lwrite_value + addiu $t7, 1 # coeff_index++ + +.Lac_prefix_000001: + # Prefix 000001 is an escape code followed by a full 16-bit MDEC value. + srl $v0, $t0, 10 # value = window >> (32 - 6 - 16) + sll $t0, 22 # window <<= 6 + 16 + addiu $t5, -22 # bit_offset -= 6 + 16 + b .Lwrite_value + addiu $t7, 1 # coeff_index++ + .word 0, 0, 0 + +.Lac_prefix_0000001: + # Prefix 0000001 is followed by a 4-bit lookup index. 
+ srl $v0, $t0, 20 # index = ((window >> (32 - 7 - 4)) & 15) * sizeof(u16) + andi $v0, 30 + addu $v0, $t8 # value = table->lut7[index] + lhu $v0, DECDCTSMALLTAB_lut7($v0) + sll $t0, 11 # window <<= 7 + 4 + addiu $t5, -11 # bit_offset -= 7 + 4 + b .Lwrite_value + addiu $t7, 1 # coeff_index++ + +.Lac_prefix_00000001: + # Prefix 00000001 is followed by a 5-bit lookup index. + srl $v0, $t0, 18 # index = ((window >> (32 - 8 - 5)) & 31) * sizeof(u16) + andi $v0, 62 + addu $v0, $t8 # value = table->lut8[index] + lhu $v0, DECDCTSMALLTAB_lut8($v0) + sll $t0, 13 # window <<= 8 + 5 + addiu $t5, -13 # bit_offset -= 8 + 5 + b .Lwrite_value + addiu $t7, 1 # coeff_index++ + +.Lac_prefix_000000001: + # Prefix 000000001 is followed by a 5-bit lookup index. + srl $v0, $t0, 17 # index = ((window >> (32 - 9 - 5)) & 31) * sizeof(u16) + andi $v0, 62 + addu $v0, $t8 # value = table->lut9[index] + lhu $v0, DECDCTSMALLTAB_lut9($v0) + sll $t0, 14 # window <<= 9 + 5 + addiu $t5, -14 # bit_offset -= 9 + 5 + b .Lwrite_value + addiu $t7, 1 # coeff_index++ + +.Lac_prefix_0000000001: + # Prefix 0000000001 is followed by a 5-bit lookup index. + srl $v0, $t0, 16 # index = ((window >> (32 - 10 - 5)) & 31) * sizeof(u16) + andi $v0, 62 + addu $v0, $t8 # value = table->lut10[index] + lhu $v0, DECDCTSMALLTAB_lut10($v0) + sll $t0, 15 # window <<= 10 + 5 + addiu $t5, -15 # bit_offset -= 10 + 5 + b .Lwrite_value + addiu $t7, 1 # coeff_index++ + +.Lac_prefix_00000000001: + # Prefix 00000000001 is followed by a 5-bit lookup index. + srl $v0, $t0, 15 # index = ((window >> (32 - 11 - 5)) & 31) * sizeof(u16) + andi $v0, 62 + addu $v0, $t8 # value = table->lut11[index] + lhu $v0, DECDCTSMALLTAB_lut11($v0) + sll $t0, 16 # window <<= 11 + 5 + addiu $t5, -16 # bit_offset -= 11 + 5 + b .Lwrite_value + addiu $t7, 1 # coeff_index++ + +.Lac_prefix_000000000001: + # Prefix 000000000001 is followed by a 5-bit lookup index. 
+ srl $v0, $t0, 14 # index = ((window >> (32 - 12 - 5)) & 31) * sizeof(u16) + andi $v0, 62 + addu $v0, $t8 # value = table->lut12[index] + lhu $v0, DECDCTSMALLTAB_lut12($v0) + sll $t0, 17 # window <<= 12 + 5 + addiu $t5, -17 # bit_offset -= 12 + 5 + b .Lwrite_value + addiu $t7, 1 # coeff_index++ + + # Prefix 0000000000001 is not valid. + beqz $t0, .Lstop_processing + nop + jr $ra + li $v0, -1 + .word 0, 0, 0, 0 + + # Prefix 00000000000001 is not valid. + beqz $t0, .Lstop_processing + nop + jr $ra + li $v0, -1 + .word 0, 0, 0, 0 + + # Prefix 000000000000001 is not valid. + beqz $t0, .Lstop_processing + nop + jr $ra + li $v0, -1 + .word 0, 0, 0, 0 + + # Prefix 0000000000000001 is not valid. + beqz $t0, .Lstop_processing + nop + jr $ra + li $v0, -1 + #.word 0, 0, 0, 0 + +.Lupdate_window_and_write: + sllv $t0, $t0, $v1 # window <<= length + subu $t5, $v1 # bit_offset -= length +.Lwrite_value: + sh $v0, 0($a1) +.Lfeed_bitstream: + # Update the window. This makes sure the next iteration of the loop will be + # able to read up to 32 bits from the bitstream. + bgez $t5, .Lskip_feeding # if (bit_offset < 0) + addiu $a2, -1 # max_size-- + + subu $v0, $0, $t5 # window = next_window << (-bit_offset) + sllv $t0, $t1, $v0 + lw $t1, 0($a3) # next_window = (*input << 16) | (*input >> 16) + addiu $t5, 32 # bit_offset += 32 + srl $v0, $t1, 16 + sll $t1, 16 + or $t1, $v0 + addiu $a3, 4 # input++ + +.Lskip_feeding: + srlv $v0, $t1, $t5 # window |= next_window >> bit_offset + or $t0, $v0 + + bnez $a2, .Lprocess_next_code_loop + addiu $a1, 2 # output++ + +.Lstop_processing: + # If remaining = 0, skip flushing the context, pad the output buffer with + # end-of-block codes if necessary and return 0. Otherwise flush the context + # and return 1. 
+ beqz $t2, .Lpad_output_buffer + nop + + sw $a3, VLC_Context_input($a0) + sw $t0, VLC_Context_window($a0) + sw $t1, VLC_Context_next_window($a0) + sw $t2, VLC_Context_remaining($a0) + sh $t3, VLC_Context_quant_scale($a0) + sb $t4, VLC_Context_is_v3($a0) + sb $t5, VLC_Context_bit_offset($a0) + sb $t6, VLC_Context_block_index($a0) + sb $t7, VLC_Context_coeff_index($a0) + + jr $ra + li $v0, 1 + +.Lpad_output_buffer: + beqz $a2, .Lreturn_zero + li $v0, 0xfe00 +.Lpad_output_buffer_loop: # while (max_size) + sh $v0, 0($a1) # *output = 0xfe00 + addiu $a2, -1 # max_size-- + bnez $a2, .Lpad_output_buffer_loop + addiu $a1, 2 # output++ + +.Lreturn_zero: + jr $ra + li $v0, 0 diff --git a/libpsn00b/psxpress/vlc2.c b/libpsn00b/psxpress/vlc2.c new file mode 100644 index 0000000..73b54b2 --- /dev/null +++ b/libpsn00b/psxpress/vlc2.c @@ -0,0 +1,240 @@ +/* + * PSn00bSDK MDEC library (alternate VLC decompressor and support code) + * (C) 2022 spicyjpeg - MPL licensed + */ + +#include <stdint.h> +#include <stddef.h> +#include <psxpress.h> + +#define _min(x, y) (((x) < (y)) ? (x) : (y)) + +/* Huffman code lookup table */ + +#define TABLE_LENGTH 226 + +// This table is run-length compressed, with the number of repetitions of each +// value stored in the upper 11 bits which would be otherwise unused. It is +// decompressed at runtime by DecDCTvlcBuild(). 
+static const uint32_t _compressed_table[TABLE_LENGTH] = { + 0x03e00000, 0x000d000b, 0x000d03f5, 0x000d2002, 0x000d23fe, 0x000d1003, + 0x000d13fd, 0x000d000a, 0x000d03f6, 0x000d0804, 0x000d0bfc, 0x000d1c02, + 0x000d1ffe, 0x000d5402, 0x000d57fe, 0x000d5001, 0x000d53ff, 0x000d0009, + 0x000d03f7, 0x000d4c01, 0x000d4fff, 0x000d4801, 0x000d4bff, 0x000d0405, + 0x000d07fb, 0x000d0c03, 0x000d0ffd, 0x000d0008, 0x000d03f8, 0x000d1802, + 0x000d1bfe, 0x000d4401, 0x000d47ff, 0x006b4001, 0x006b43ff, 0x006b1402, + 0x006b17fe, 0x006b0007, 0x006b03f9, 0x006b0803, 0x006b0bfd, 0x006b0404, + 0x006b07fc, 0x006b3c01, 0x006b3fff, 0x006b3801, 0x006b3bff, 0x006b1002, + 0x006b13fe, 0x0fe00000, 0x03e80802, 0x03e80bfe, 0x03e82401, 0x03e827ff, + 0x03e80004, 0x03e803fc, 0x03e82001, 0x03e823ff, 0x07e71c01, 0x07e71fff, + 0x07e71801, 0x07e71bff, 0x07e70402, 0x07e707fe, 0x07e71401, 0x07e717ff, + 0x01e93401, 0x01e937ff, 0x01e90006, 0x01e903fa, 0x01e93001, 0x01e933ff, + 0x01e92c01, 0x01e92fff, 0x01e90c02, 0x01e90ffe, 0x01e90403, 0x01e907fd, + 0x01e90005, 0x01e903fb, 0x01e92801, 0x01e92bff, 0x0fe60003, 0x0fe603fd, + 0x0fe61001, 0x0fe613ff, 0x0fe60c01, 0x0fe60fff, 0x1fe50002, 0x1fe503fe, + 0x1fe50801, 0x1fe50bff, 0x3fe40401, 0x3fe407ff, 0xffe2fe00, 0x7fe30001, + 0x7fe303ff, 0x03e00000, 0x00110412, 0x001107ee, 0x00110411, 0x001107ef, + 0x00110410, 0x001107f0, 0x0011040f, 0x001107f1, 0x00111803, 0x00111bfd, + 0x00114002, 0x001143fe, 0x00113c02, 0x00113ffe, 0x00113802, 0x00113bfe, + 0x00113402, 0x001137fe, 0x00113002, 0x001133fe, 0x00112c02, 0x00112ffe, + 0x00117c01, 0x00117fff, 0x00117801, 0x00117bff, 0x00117401, 0x001177ff, + 0x00117001, 0x001173ff, 0x00116c01, 0x00116fff, 0x00300028, 0x003003d8, + 0x00300027, 0x003003d9, 0x00300026, 0x003003da, 0x00300025, 0x003003db, + 0x00300024, 0x003003dc, 0x00300023, 0x003003dd, 0x00300022, 0x003003de, + 0x00300021, 0x003003df, 0x00300020, 0x003003e0, 0x0030040e, 0x003007f2, + 0x0030040d, 0x003007f3, 0x0030040c, 0x003007f4, 0x0030040b, 0x003007f5, + 0x0030040a, 
0x003007f6, 0x00300409, 0x003007f7, 0x00300408, 0x003007f8, + 0x006f001f, 0x006f03e1, 0x006f001e, 0x006f03e2, 0x006f001d, 0x006f03e3, + 0x006f001c, 0x006f03e4, 0x006f001b, 0x006f03e5, 0x006f001a, 0x006f03e6, + 0x006f0019, 0x006f03e7, 0x006f0018, 0x006f03e8, 0x006f0017, 0x006f03e9, + 0x006f0016, 0x006f03ea, 0x006f0015, 0x006f03eb, 0x006f0014, 0x006f03ec, + 0x006f0013, 0x006f03ed, 0x006f0012, 0x006f03ee, 0x006f0011, 0x006f03ef, + 0x006f0010, 0x006f03f0, 0x00ee2802, 0x00ee2bfe, 0x00ee2402, 0x00ee27fe, + 0x00ee1403, 0x00ee17fd, 0x00ee0c04, 0x00ee0ffc, 0x00ee0805, 0x00ee0bfb, + 0x00ee0407, 0x00ee07f9, 0x00ee0406, 0x00ee07fa, 0x00ee000f, 0x00ee03f1, + 0x00ee000e, 0x00ee03f2, 0x00ee000d, 0x00ee03f3, 0x00ee000c, 0x00ee03f4, + 0x00ee6801, 0x00ee6bff, 0x00ee6401, 0x00ee67ff, 0x00ee6001, 0x00ee63ff, + 0x00ee5c01, 0x00ee5fff, 0x00ee5801, 0x00ee5bff +}; + +/* Internal globals */ + +// Note that DecDCTvlc() and DecDCTvlc2() do *not* share the same variables. +static VLC_Context _default_context; +static size_t _max_buffer_size = 0; + +const DECDCTTAB2 *_vlc_huffman_table2 = 0; + +/* VLC decoder */ + +#define _get_bits_unsigned(length) (((uint32_t) window) >> (32 - (length))) +#define _get_bits_signed(length) (((int32_t) window) >> (32 - (length))) +#define _advance_window(num) \ + window <<= (num); \ + bit_offset -= (num); + +int __attribute__((optimize(3))) DecDCTvlcContinue2( + VLC_Context *ctx, uint32_t *buf, size_t max_size +) { + const uint32_t *input = ctx->input; + uint32_t remaining = ctx->remaining; + uint32_t window = ctx->window; + uint32_t next_window = ctx->next_window; + uint16_t quant_scale = ctx->quant_scale; + int block_index = ctx->block_index; + int coeff_index = ctx->coeff_index; + int bit_offset = ctx->bit_offset; + int is_v3 = ctx->is_v3; + + //if (!_vlc_huffman_table2) + //return -1; + if (!max_size) + max_size = 0x7fffffff; + + // Write the length of the data that will be decoded to first 4 bytes of + // the output buffer, which will be then parsed by 
DecDCTin(). + max_size = _min((max_size - 1) * 2, remaining); + remaining -= max_size; + + *buf = 0x38000000 | (max_size / 2); + uint16_t *output = (uint16_t *) &buf[1]; + + for (; max_size; max_size--) { + uint32_t value; + + if (coeff_index) { + // Parse the next AC coefficient. Most codes are decompressed via + // the lookup table, however some need special handling. + if ((window >> 30) == 0b10) { + // Prefix 10 marks the end of a block. + *output = 0xfe00; + _advance_window(2); + + coeff_index = -1; + block_index++; + if (block_index > 5) + block_index = 0; + } else if ((window >> 26) == 0b000001) { + // Prefix 000001 is an escape code followed by a full 16-bit + // MDEC value. + *output = (uint16_t) _get_bits_unsigned(22); + _advance_window(22); + } else if (window >> 24) { + // The first lookup table is for codes that not start with + // 00000000. + value = _vlc_huffman_table2->lut[_get_bits_unsigned(13)]; + _advance_window(value >> 16); + *output = (uint16_t) value; + } else { + // If the code starts with 00000000, use the second lookup + // table. + value = _vlc_huffman_table2->lut00[_get_bits_unsigned(17)]; + _advance_window(value >> 16); + *output = (uint16_t) value; + } + } else { + // Parse the DC (first) coefficient for this block. Version 2 + // simply stores the signed 10-bit value as-is, while version 3 + // uses a delta encoding combined with a compression method similar + // to exp-Golomb. + if (is_v3) { + // TODO: version 3 is currently not supported. + return -1; + } else { + value = _get_bits_unsigned(10); + *output = value | quant_scale; + _advance_window(10); + } + } + + output++; + coeff_index++; + + // Update the bitstream window. For whatever reason Sony's DecDCTvlc() + // implementation inefficiently reads the input stream 16 bits at a + // time and processes each 16-bit word starting from the the MSB, so an + // endianness conversion is necessary to preserve bit order when + // reading 32 bits at a time. 
Also note that the PS1 CPU is not capable + // of shifting by more than 31 bits - it will shift by 0 bits instead! + if (bit_offset < 0) { + window = next_window << (-bit_offset); + bit_offset += 32; + next_window = (*input << 16) | (*input >> 16); + input++; + }; + window |= next_window >> bit_offset; + } + + // Pad the buffer with end-of-block codes if necessary. + for (; max_size; max_size--) + *(output++) = 0xfe00; + + if (!remaining) + return 0; + + ctx->input = input; + ctx->remaining = remaining; + ctx->window = window; + ctx->next_window = next_window; + ctx->block_index = block_index; + ctx->coeff_index = coeff_index; + ctx->bit_offset = bit_offset; + return 1; +} + +int DecDCTvlcStart2( + VLC_Context *ctx, uint32_t *buf, size_t max_size, const uint32_t *bs +) { + const BS_Header *header = (const BS_Header *) bs; + const uint32_t *input = (const uint32_t *) &header[1]; + + if (!_vlc_huffman_table2) + return -1; + if (header->version > 3) + return -1; + + ctx->input = &input[2]; + ctx->remaining = (header->mdec0_header & 0xffff) * 2; + ctx->window = (input[0] << 16) | (input[0] >> 16); + ctx->next_window = (input[1] << 16) | (input[1] >> 16); + ctx->quant_scale = (header->quant_scale & 63) << 10; + ctx->block_index = 0; + ctx->coeff_index = 0; + ctx->bit_offset = 32; + ctx->is_v3 = (header->version == 3); + + return DecDCTvlcContinue2(ctx, buf, max_size); +} + +/* Stateful VLC decoder API (for Sony SDK compatibility) */ + +int DecDCTvlc2(const uint32_t *bs, uint32_t *buf, DECDCTTAB2 *table) { + if (table) + _vlc_huffman_table2 = table; + + if (bs) + return DecDCTvlcStart2(&_default_context, buf, _max_buffer_size, bs); + else + return DecDCTvlcContinue2(&_default_context, buf, _max_buffer_size); +} + +size_t DecDCTvlcSize2(size_t size) { + size_t old_size = _max_buffer_size; + _max_buffer_size = size; + + return old_size; +} + +/* Lookup table decompressor */ + +void DecDCTvlcBuild(DECDCTTAB2 *table) { + uint32_t *output = (uint32_t *) table; + 
_vlc_huffman_table2 = table; + + for (int i = 0; i < TABLE_LENGTH; i++) { + uint32_t value = _compressed_table[i] & 0x001fffff; + + for (int j = (_compressed_table[i] >> 21); j >= 0; j--) + *(output++) = value; + } +} |
