aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorspicyjpeg <thatspicyjpeg@gmail.com>2022-07-17 21:33:35 +0200
committerspicyjpeg <thatspicyjpeg@gmail.com>2022-07-17 21:33:35 +0200
commit5b63607ba4ca12c2a2935ea9618b3ffe6a6d3ab3 (patch)
treef7ec51b65eb148d4fbddca5127589592e57d219c
parentc800972bc13ad0c7015b7d44fe9f124b719e792e (diff)
downloadpsn00bsdk-5b63607ba4ca12c2a2935ea9618b3ffe6a6d3ab3.tar.gz
Add experimental psxpress Huffman decoding API
-rw-r--r--libpsn00b/include/psxpress.h250
-rw-r--r--libpsn00b/psxpress/generate_lookup_table.py297
-rw-r--r--libpsn00b/psxpress/vlc.c130
-rw-r--r--libpsn00b/psxpress/vlc.s404
-rw-r--r--libpsn00b/psxpress/vlc2.c240
5 files changed, 1313 insertions, 8 deletions
diff --git a/libpsn00b/include/psxpress.h b/libpsn00b/include/psxpress.h
index 6203c2a..2106a53 100644
--- a/libpsn00b/include/psxpress.h
+++ b/libpsn00b/include/psxpress.h
@@ -17,6 +17,28 @@ typedef struct _DECDCTENV {
int16_t dct[64]; // Inverse DCT matrix (2.14 fixed-point)
} DECDCTENV;
+// This is the "small" lookup table used by DecDCTvlc(). It can be copied to
+// the scratchpad.
+//
+// There is one sub-table for each Huffman code prefix, named after the length
+// of the prefix in bits (with the exception of lut0, which serves prefix 11).
+// Each sub-table is indexed by the bits that follow the prefix. Entries hold
+// an MDEC value (run length << 10 | 10-bit coefficient); lut2 and lut3 are 32
+// bits wide as they additionally store the total length of the code in their
+// upper 16 bits.
+//
+// NOTE: the GTE-accelerated decoder (vlc.s) accesses these fields through
+// hardcoded byte offsets, so the layout of this structure must not be changed
+// without updating vlc.s accordingly.
+typedef struct _DECDCTTAB {
+ uint16_t lut0[2]; // Prefix 11
+ uint32_t lut2[8]; // Prefix 01 (entries include code length)
+ uint32_t lut3[64]; // Prefix 001 (entries include code length)
+ uint16_t lut4[8]; // Prefix 0001
+ uint16_t lut5[8]; // Prefix 00001
+ uint16_t lut7[16]; // Prefix 0000001
+ uint16_t lut8[32]; // Prefix 00000001
+ uint16_t lut9[32]; // Prefix 000000001
+ uint16_t lut10[32]; // Prefix 0000000001
+ uint16_t lut11[32]; // Prefix 00000000001
+ uint16_t lut12[32]; // Prefix 000000000001
+} DECDCTTAB;
+
+// This is the "large" table used by DecDCTvlc2(). As laid out by
+// generate_lookup_table.py, each entry is (code length << 16) | MDEC value.
+// Codes that begin with 8 zero bits are stored in lut00 (indexed by the 9 bits
+// that follow those zeroes), all other codes in lut (indexed by 13 bits).
+typedef struct _DECDCTTAB2 {
+ uint32_t lut[8192];
+ uint32_t lut00[512];
+} DECDCTTAB2;
+
typedef enum _DECDCTMODE {
DECDCT_MODE_24BPP = 1,
DECDCT_MODE_16BPP = 0,
@@ -24,6 +46,23 @@ typedef enum _DECDCTMODE {
DECDCT_MODE_RAW = -1
} DECDCTMODE;
+// State of a VLC decoder, set up by DecDCTvlcStart() and read back by
+// DecDCTvlcContinue(). The GTE-accelerated decoder (vlc.s) accesses these
+// fields through hardcoded byte offsets, so their order and sizes must not be
+// changed without updating vlc.s accordingly.
+typedef struct _VLC_Context {
+ const uint32_t *input; // Next bitstream word to be fetched
+ // window holds the bits currently being decoded (consumed starting from the
+ // most significant one) and next_window the word fetched after it. remaining
+ // is the amount of output data left to generate, in 16-bit units.
+ uint32_t window, next_window, remaining;
+ uint16_t quant_scale; // Stored pre-shifted as (quant_scale & 63) << 10
+ // is_v3 is non-zero if the bitstream version is 3 or greater. bit_offset is
+ // initialized to 32 and decreased as bits are consumed. block_index counts
+ // down from 5 on each end-of-block code, wrapping back to 5. coeff_index is
+ // zero when the next value to decode is a DC coefficient.
+ int8_t is_v3, bit_offset, block_index, coeff_index;
+} VLC_Context;
+
+// Despite what some docs claim, the "number of 32-byte blocks" and "always
+// 0x3800" fields are actually a single 32-bit field which is copied over to
+// the output buffer, then parsed by DecDCTin() and written to the MDEC0
+// register.
+//
+// This 8-byte header is immediately followed by the compressed bitstream data.
+typedef struct {
+ uint32_t mdec0_header; // Lower 16 bits = uncompressed length in 32-bit words
+ uint16_t quant_scale; // Only the lower 6 bits are used by DecDCTvlcStart()
+ uint16_t version; // Values >= 3 are treated as version 3 by DecDCTvlcStart()
+} BS_Header;
+
/* Public API */
#ifdef __cplusplus
@@ -64,12 +103,12 @@ void DecDCTReset(int mode);
void DecDCTPutEnv(const DECDCTENV *env, int mono);
/**
- * @brief Sets up the MDEC to start fetching and decoding a stream from the
- * given address in main RAM. The first 32-bit word is initially copied to the
- * MDEC0 register, then all subsequent data is read in 128-byte (32-word)
- * chunks. The length of the stream (in 32-bit units, minus the first word)
- * must be encoded in the lower 16 bits of the first word, as expected by the
- * MDEC.
+ * @brief Sets up the MDEC to start fetching and decoding the given buffer.
+ * This function is meant to be used with buffers generated by DecDCTvlc(): the
+ * first 32-bit word of the buffer is initially copied to the MDEC0 register,
+ * then all subsequent data is read in 128-byte (32-word) chunks. The length of
+ * the stream (in 32-bit units, minus the first word) is encoded by DecDCTvlc()
+ * in the lower 16 bits of the first word.
*
* The mode argument optionally specifies the output color depth (0 for 16bpp,
* 1 for 24bpp) if not already set in the first word. Passing -1 will result in
@@ -111,7 +150,7 @@ void DecDCTinRaw(const uint32_t *data, size_t length);
* stream (usually a whole frame) is being written to main RAM.
*
* @param mode
- * @return 0 or -1 in case of a timeout (mode = 0) / MDEC busy flag (mode = 1)
+ * @return 0 or -1 in case of a timeout (mode = 0), MDEC busy flag (mode = 1)
*/
int DecDCTinSync(int mode);
@@ -142,10 +181,205 @@ void DecDCTout(uint32_t *data, size_t length);
* to register a callback that calls DecDCTin() to feed the MDEC.
*
* @param mode
- * @return 0 or -1 in case of a timeout (mode = 0) / DMA busy flag (mode = 1)
+ * @return 0 or -1 in case of a timeout (mode = 0), DMA busy flag (mode = 1)
*/
int DecDCToutSync(int mode);
+/**
+ * @brief Begins decompressing the contents of a .BS file (or of a single .STR
+ * frame) into a buffer that can be passed to DecDCTin(). This function uses a
+ * small (<1 KB) lookup table combined with the GTE to accelerate the process;
+ * performance is roughly on par with DecDCTvlcStart2() if the lookup table
+ * is copied to the scratchpad beforehand by calling DecDCTvlcCopyTable(). The
+ * contents of the GTE's LZCR register, if any, will be destroyed.
+ *
+ * A VLC_Context object must be created and passed to this function, which will
+ * then proceed to initialize its fields. The max_size argument sets the
+ * maximum number of words that will be written to the output buffer; if more
+ * data needs to be written, this function will return 1. To continue decoding
+ * call DecDCTvlcContinue() with the same VLC_Context object (the output buffer
+ * can be different). If max_size = 0, the entire frame will always be decoded
+ * in one shot.
+ *
+ * Only bitstream version 2 is currently supported.
+ *
+ * WARNING: InitGeom() must be called prior to using DecDCTvlcStart() for the
+ * first time. Attempting to call this function with the GTE disabled will
+ * result in a crash.
+ *
+ * @param ctx Pointer to VLC_Context structure (which will be initialized)
+ * @param buf Pointer to output buffer
+ * @param max_size Maximum number of 32-bit words to output
+ * @param bs Pointer to .BS file or .STR frame data (including its header)
+ * @return 0, 1 if more data needs to be output or -1 in case of failure
+ */
+int DecDCTvlcStart(VLC_Context *ctx, uint32_t *buf, size_t max_size, const uint32_t *bs);
+
+/**
+ * @brief Resumes the decompression process started by DecDCTvlcStart(). The
+ * state of the decompressor is contained entirely in the VLC_Context structure
+ * so an arbitrary number of bitstreams can be decoded concurrently (although
+ * the limited CPU power makes it impractical to do so) by keeping a separate
+ * context for each bitstream.
+ *
+ * This function behaves like DecDCTvlcStart(), returning 1 if more data has to
+ * be written or 0 otherwise. DecDCTvlcContinue() shall not be called after a
+ * previous call to DecDCTvlcStart() or DecDCTvlcContinue() with the same
+ * context returned 0; in that case the context shall be discarded or reused to
+ * decode another bitstream.
+ *
+ * The contents of the GTE's LZCR register, if any, will be destroyed.
+ *
+ * See DecDCTvlcStart() for more details.
+ *
+ * @param ctx Pointer to already initialized VLC_Context structure
+ * @param buf Pointer to output buffer
+ * @param max_size Maximum number of 32-bit words to output
+ * @return 0, 1 if more data needs to be output or -1 in case of failure
+ */
+int DecDCTvlcContinue(VLC_Context *ctx, uint32_t *buf, size_t max_size);
+
+/**
+ * @brief A wrapper around DecDCTvlcStart() and DecDCTvlcContinue() for
+ * compatibility with the official SDK. This function uses an internal
+ * context; additionally, the maximum output buffer size is not passed as an
+ * argument but is instead set by calling DecDCTvlcSize().
+ *
+ * This function behaves identically to DecDCTvlcContinue() if bs = 0 and
+ * DecDCTvlcStart() otherwise.
+ *
+ * See DecDCTvlcStart() for more details.
+ *
+ * WARNING: InitGeom() must be called prior to using DecDCTvlc() for the first
+ * time. Attempting to call this function with the GTE disabled will result in
+ * a crash.
+ *
+ * @param bs Pointer to bitstream data or 0 to resume decoding
+ * @param buf Pointer to output buffer
+ * @return 0, 1 if more data needs to be output or -1 in case of failure
+ */
+int DecDCTvlc(const uint32_t *bs, uint32_t *buf);
+
+/**
+ * @brief Sets the maximum number of 32-bit words that a single call to
+ * DecDCTvlc() will output. If size = 0, the entire frame will always be
+ * decoded in one shot.
+ *
+ * @param size Maximum number of 32-bit words to output
+ * @return Previously set value
+ */
+size_t DecDCTvlcSize(size_t size);
+
+/**
+ * @brief Copies the small (<1 KB) lookup table used by DecDCTvlcContinue(),
+ * DecDCTvlcStart() and DecDCTvlc() (a DECDCTTAB structure) to the specified
+ * address. A copy of this table is always present in main RAM, however this
+ * function can be used to copy it to the scratchpad region to boost
+ * decompression performance.
+ *
+ * The address passed to this function is saved. Calls to DecDCTvlcStart(),
+ * DecDCTvlcContinue() and DecDCTvlc() will automatically use the last table
+ * copied. Call DecDCTvlcCopyTable(0) to revert to using the library's internal
+ * table in main RAM.
+ *
+ * @param addr Pointer to free area in scratchpad region or 0 to reset
+ */
+void DecDCTvlcCopyTable(DECDCTTAB *addr);
+
+/**
+ * @brief Begins decompressing the contents of a .BS file (or of a single .STR
+ * frame) into a buffer that can be passed to DecDCTin(). This function uses a
+ * large (34 KB) lookup table that must be loaded into main RAM beforehand by
+ * calling DecDCTvlcBuild(), but does not use the GTE nor the scratchpad.
+ * Depending on the specific bitstream being decoded DecDCTvlcStart2() might be
+ * slightly faster or slower than DecDCTvlcStart() with its lookup table copied
+ * to the scratchpad (see DecDCTvlcCopyTable()). DecDCTvlcStart() with the
+ * table in main RAM tends to be much slower.
+ *
+ * A VLC_Context object must be created and passed to this function, which will
+ * then proceed to initialize its fields. The max_size argument sets the
+ * maximum number of words that will be written to the output buffer; if more
+ * data needs to be written, this function will return 1. To continue decoding
+ * call DecDCTvlcContinue2() with the same VLC_Context object (the output
+ * buffer can be different). If max_size = 0, the entire frame will always be
+ * decoded in one shot.
+ *
+ * Only bitstream version 2 is currently supported.
+ *
+ * @param ctx Pointer to VLC_Context structure (which will be initialized)
+ * @param buf Pointer to output buffer
+ * @param max_size Maximum number of 32-bit words to output
+ * @param bs Pointer to .BS file or .STR frame data (including its header)
+ * @return 0, 1 if more data needs to be output or -1 in case of failure
+ */
+int DecDCTvlcStart2(VLC_Context *ctx, uint32_t *buf, size_t max_size, const uint32_t *bs);
+
+/**
+ * @brief Resumes the decompression process started by DecDCTvlcStart2(). The
+ * state of the decompressor is contained entirely in the VLC_Context structure
+ * so an arbitrary number of bitstreams can be decoded concurrently (although
+ * the limited CPU power makes it impractical to do so) by keeping a separate
+ * context for each bitstream.
+ *
+ * This function behaves like DecDCTvlcStart2(), returning 1 if more data has
+ * to be written or 0 otherwise. DecDCTvlcContinue2() shall not be called after
+ * a previous call to DecDCTvlcStart2() or DecDCTvlcContinue2() with the same
+ * context returned 0; in that case the context shall be discarded or reused to
+ * decode another bitstream.
+ *
+ * See DecDCTvlcStart2() for more details.
+ *
+ * @param ctx Pointer to already initialized VLC_Context structure
+ * @param buf Pointer to output buffer
+ * @param max_size Maximum number of 32-bit words to output
+ * @return 0, 1 if more data needs to be output or -1 in case of failure
+ */
+int DecDCTvlcContinue2(VLC_Context *ctx, uint32_t *buf, size_t max_size);
+
+/**
+ * @brief A wrapper around DecDCTvlcStart2() and DecDCTvlcContinue2() for
+ * compatibility with the official SDK. This function uses an internal
+ * context; additionally, the maximum output buffer size is not passed as an
+ * argument but is instead set by calling DecDCTvlcSize2().
+ *
+ * This function behaves identically to DecDCTvlcContinue2() if bs = 0 and
+ * DecDCTvlcStart2() otherwise. The table argument can optionally be passed to
+ * use a custom lookup table. If zero, the last pointer passed to
+ * DecDCTvlcBuild() will be used.
+ *
+ * See DecDCTvlcStart2() for more details.
+ *
+ * @param bs Pointer to bitstream data or 0 to resume decoding
+ * @param buf Pointer to output buffer
+ * @param table Pointer to decompressed table or 0 to use last table used
+ * @return 0, 1 if more data needs to be output or -1 in case of failure
+ */
+int DecDCTvlc2(const uint32_t *bs, uint32_t *buf, DECDCTTAB2 *table);
+
+/**
+ * @brief Sets the maximum number of 32-bit words that a single call to
+ * DecDCTvlc2() will output. If size = 0, the entire frame will always be
+ * decoded in one shot.
+ *
+ * @param size Maximum number of 32-bit words to output
+ * @return Previously set value
+ */
+size_t DecDCTvlcSize2(size_t size);
+
+/**
+ * @brief Generates the lookup table required by DecDCTvlcStart2(),
+ * DecDCTvlcContinue2() and DecDCTvlc2() (a DECDCTTAB2 structure) into the
+ * specified buffer. Since the table is relatively large (34 KB), it is
+ * recommended to only generate it in a dynamically-allocated buffer when
+ * needed and deallocate the buffer afterwards.
+ *
+ * The address passed to this function is saved. Calls to DecDCTvlcStart2() and
+ * DecDCTvlcContinue2() will automatically use the last table decompressed.
+ *
+ * @param table Pointer to the buffer the table will be generated into
+ */
+void DecDCTvlcBuild(DECDCTTAB2 *table);
+
#ifdef __cplusplus
}
#endif
diff --git a/libpsn00b/psxpress/generate_lookup_table.py b/libpsn00b/psxpress/generate_lookup_table.py
new file mode 100644
index 0000000..b40771f
--- /dev/null
+++ b/libpsn00b/psxpress/generate_lookup_table.py
@@ -0,0 +1,297 @@
+#!/usr/bin/env python3
+# Huffman lookup table generator script for psxpress
+# (C) 2022 spicyjpeg - MPL licensed
+
+import sys, json
+from array import array
+from itertools import repeat
+from argparse import ArgumentParser, FileType
+
+HUFFMAN_TREE = {
+ "10": 0xfe00, # End of block
+ "11": ( 0, 1 ),
+ "01": {
+ "1": ( 1, 1 ),
+ "00": ( 0, 2 ),
+ "01": ( 2, 1 )
+ },
+ "001": {
+ "01": ( 0, 3 ),
+ "10": ( 4, 1 ),
+ "11": ( 3, 1 ),
+ "00000": ( 13, 1 ),
+ "00001": ( 0, 6 ),
+ "00010": ( 12, 1 ),
+ "00011": ( 11, 1 ),
+ "00100": ( 3, 2 ),
+ "00101": ( 1, 3 ),
+ "00110": ( 0, 5 ),
+ "00111": ( 10, 1 )
+ },
+ "0001": {
+ "00": ( 7, 1 ),
+ "01": ( 6, 1 ),
+ "10": ( 1, 2 ),
+ "11": ( 5, 1 )
+ },
+ "00001": {
+ "00": ( 2, 2 ),
+ "01": ( 9, 1 ),
+ "10": ( 0, 4 ),
+ "11": ( 8, 1 )
+ },
+ "0000001": {
+ "000": ( 16, 1 ),
+ "001": ( 5, 2 ),
+ "010": ( 0, 7 ),
+ "011": ( 2, 3 ),
+ "100": ( 1, 4 ),
+ "101": ( 15, 1 ),
+ "110": ( 14, 1 ),
+ "111": ( 4, 2 )
+ },
+ "00000001": {
+ "0000": ( 0, 11 ),
+ "0001": ( 8, 2 ),
+ "0010": ( 4, 3 ),
+ "0011": ( 0, 10 ),
+ "0100": ( 2, 4 ),
+ "0101": ( 7, 2 ),
+ "0110": ( 21, 2 ),
+ "0111": ( 20, 1 ),
+ "1000": ( 0, 9 ),
+ "1001": ( 19, 1 ),
+ "1010": ( 18, 1 ),
+ "1011": ( 1, 5 ),
+ "1100": ( 3, 3 ),
+ "1101": ( 0, 8 ),
+ "1110": ( 6, 2 ),
+ "1111": ( 17, 1 )
+ },
+ "000000001": {
+ "0000": ( 10, 2 ),
+ "0001": ( 9, 2 ),
+ "0010": ( 5, 3 ),
+ "0011": ( 3, 4 ),
+ "0100": ( 2, 5 ),
+ "0101": ( 1, 7 ),
+ "0110": ( 1, 6 ),
+ "0111": ( 0, 15 ),
+ "1000": ( 0, 14 ),
+ "1001": ( 0, 13 ),
+ "1010": ( 0, 12 ),
+ "1011": ( 26, 1 ),
+ "1100": ( 25, 1 ),
+ "1101": ( 24, 1 ),
+ "1110": ( 23, 1 ),
+ "1111": ( 22, 1 )
+ },
+ "0000000001": {
+ "0000": ( 0, 31 ),
+ "0001": ( 0, 30 ),
+ "0010": ( 0, 29 ),
+ "0011": ( 0, 28 ),
+ "0100": ( 0, 27 ),
+ "0101": ( 0, 26 ),
+ "0110": ( 0, 25 ),
+ "0111": ( 0, 24 ),
+ "1000": ( 0, 23 ),
+ "1001": ( 0, 22 ),
+ "1010": ( 0, 21 ),
+ "1011": ( 0, 20 ),
+ "1100": ( 0, 19 ),
+ "1101": ( 0, 18 ),
+ "1110": ( 0, 17 ),
+ "1111": ( 0, 16 )
+ },
+ "00000000001": {
+ "0000": ( 0, 40 ),
+ "0001": ( 0, 39 ),
+ "0010": ( 0, 38 ),
+ "0011": ( 0, 37 ),
+ "0100": ( 0, 36 ),
+ "0101": ( 0, 35 ),
+ "0110": ( 0, 34 ),
+ "0111": ( 0, 33 ),
+ "1000": ( 0, 32 ),
+ "1001": ( 1, 14 ),
+ "1010": ( 1, 13 ),
+ "1011": ( 1, 12 ),
+ "1100": ( 1, 11 ),
+ "1101": ( 1, 10 ),
+ "1110": ( 1, 9 ),
+ "1111": ( 1, 8 )
+ },
+ "000000000001": {
+ "0000": ( 1, 18 ),
+ "0001": ( 1, 17 ),
+ "0010": ( 1, 16 ),
+ "0011": ( 1, 15 ),
+ "0100": ( 6, 3 ),
+ "0101": ( 16, 2 ),
+ "0110": ( 15, 2 ),
+ "0111": ( 14, 2 ),
+ "1000": ( 13, 2 ),
+ "1001": ( 12, 2 ),
+ "1010": ( 11, 2 ),
+ "1011": ( 31, 1 ),
+ "1100": ( 30, 1 ),
+ "1101": ( 29, 1 ),
+ "1110": ( 28, 1 ),
+ "1111": ( 27, 1 )
+ }
+}
+
+## Utilities
+
+def to_int10(value):
+ clamped = min(max(int(value), -0x200), 0x1ff)
+ return clamped + (0 if clamped >= 0 else 0x400)
+
+def uint32_to_lines(data, indent = "\t", columns = 6):
+ for offset in range(0, len(data), columns):
+ line = f"{indent}0x{data[offset]:08x}"
+
+ for item in data[(offset + 1):(offset + columns)]:
+ line += f", 0x{item:08x}"
+
+ yield line
+
+## Table generation
+
+def iterate_tree(tree):
+ for code, value in tree.items():
+ if type(value) is dict:
+ # Iterate through any subtree recursively.
+ for suffix, _value in iterate_tree(value):
+ yield f"{code}{suffix}", _value
+
+ elif type(value) is tuple:
+ run_length, ac = value
+ yield f"{code}0", (run_length << 10) | to_int10(ac)
+ yield f"{code}1", (run_length << 10) | to_int10(-ac)
+
+ else:
+ yield code, value
+
+def generate_table(codes, table_bits, prefix_bits = 0):
+ table = array("I", repeat(0, 2 ** table_bits))
+
+ for code, value in codes:
+ used_bits = len(code)
+ free_bits = table_bits - (used_bits - prefix_bits)
+ index = int(code[prefix_bits:], 2) << free_bits
+
+ # Fill out every entry in the table whose index starts with the same
+ # string of bits.
+ for combo in range(2 ** free_bits):
+ table[index | combo] = (used_bits << 16) | value
+
+ return table
+
+def compress_table(table):
+ values = []
+ last_value = table[0]
+ run_length = 0
+
+ for value in table[1:]:
+ if value == last_value and run_length < 0x7ff:
+ run_length += 1
+ continue
+
+ # The run length is stored in the top 11 bits of each value, which are
+ # otherwise unused.
+ values.append((run_length << 21) | last_value)
+ last_value = value
+ run_length = 0
+
+ values.append((run_length << 21) | last_value)
+ return array("I", values)
+
+## Main
+
+UNCOMPRESSED_TEMPLATE = """static const DECDCTTAB {name} = {{
+ .lut = {{
+{short}
+ }},
+ .lut00 = {{
+{long}
+ }}
+}};
+"""
+COMPRESSED_TEMPLATE = """static const uint32_t {name}[{length}] = {{
+{table}
+}};
+"""
+
+def get_args():
+ parser = ArgumentParser(
+ description = "Generates a Huffman lookup table structure, to be used by DecDCTvlc2()."
+ )
+ parser.add_argument(
+ "-c", "--compress",
+ action = "store_true",
+ help = "generate run-length compressed data instead of a DECDCTTAB struct"
+ )
+ parser.add_argument(
+ "-n", "--name",
+ type = str,
+ default = "_default_huffman_table",
+ help = "set the symbol name in the generated C source",
+ metavar = "file"
+ )
+ parser.add_argument(
+ "-t", "--tree",
+ type = FileType("rt"),
+ help = "use a custom Huffman tree from the specified JSON file",
+ metavar = "json_file"
+ )
+ parser.add_argument(
+ "-o", "--output",
+ type = FileType("wt"),
+ default = sys.stdout,
+ help = "where to output generated table (stdout by default)",
+ metavar = "file"
+ )
+
+ return parser.parse_args()
+
+def main():
+ args = get_args()
+ tree = json.load(args.tree) if args.tree else HUFFMAN_TREE
+
+ short_codes, short_bits = [], 0
+ long_codes, long_bits = [], 0
+
+ for pair in iterate_tree(tree):
+ if (code := pair[0]).startswith("00000000"):
+ long_codes.append(pair)
+ long_bits = max(long_bits, len(code) - 8)
+ else:
+ short_codes.append(pair)
+ short_bits = max(short_bits, len(code))
+
+ short_table = generate_table(short_codes, short_bits, 0)
+ long_table = generate_table(long_codes, long_bits, 8)
+
+ if args.compress:
+ short_table.extend(long_table)
+ table = compress_table(short_table)
+
+ source = COMPRESSED_TEMPLATE.format(
+ name = args.name,
+ length = len(table),
+ table = ",\n".join(uint32_to_lines(table, "\t"))
+ )
+ else:
+ source = UNCOMPRESSED_TEMPLATE.format(
+ name = args.name,
+ short = ",\n".join(uint32_to_lines(short_table, "\t\t")),
+ long = ",\n".join(uint32_to_lines(long_table, "\t\t"))
+ )
+
+ with args.output as _file:
+ _file.write(source)
+
+if __name__ == "__main__":
+ main()
diff --git a/libpsn00b/psxpress/vlc.c b/libpsn00b/psxpress/vlc.c
new file mode 100644
index 0000000..4e3e283
--- /dev/null
+++ b/libpsn00b/psxpress/vlc.c
@@ -0,0 +1,130 @@
+/*
+ * PSn00bSDK MDEC library (support code for the main VLC decompressor)
+ * (C) 2022 spicyjpeg - MPL licensed
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+#include <psxpress.h>
+
+/* Huffman code lookup table */
+
+// _val1() packs a run length and a coefficient (truncated to 10 bits) into a
+// 16-bit MDEC value; _val2() additionally stores the total length of the
+// Huffman code in the upper 16 bits. The _pair*() macros emit the positive
+// variant of a value followed by the negative one, each repeated 1 (_pair,
+// _pair2), 2 (_pair3) or 8 (_pair4) times in order to fill in the entries
+// whose trailing index bits are not part of the code.
+#define _val1(rl, dc) (((rl) << 10) | ((uint16_t) (dc) & 0x3ff))
+#define _val2(rl, dc, len) (_val1(rl, dc) | ((len) << 16))
+
+#define _pair(rl, dc) _val1(rl, dc), _val1(rl, -(dc))
+#define _pair2(rl, dc, len) _val2(rl, dc, len), _val2(rl, -(dc), len)
+#define _pair3(rl, dc, len) \
+ _val2(rl, dc, len), _val2(rl, dc, len), \
+ _val2(rl, -(dc), len), _val2(rl, -(dc), len)
+#define _pair4(rl, dc, len) \
+ _val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), \
+ _val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), _val2(rl, dc, len), \
+ _val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len), \
+ _val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len), _val2(rl, -(dc), len)
+
+// This table isn't compressed since it makes no sense to compress less than a
+// kilobyte's worth of data.
+//
+// The comment at the top of each sub-table shows the prefix it is associated
+// with, followed by the bits used as an index into it: "x" is a bit that is
+// part of the code (the last one always being the sign of the coefficient),
+// "-" is a bit that is not part of the code and thus ignored by having the
+// same value stored for both of its states.
+static const DECDCTTAB _default_huffman_table = {
+ .lut0 = {
+ // 11 x
+ _pair( 0, 1)
+ },
+ .lut2 = {
+ // 01 0xx
+ _pair2( 0, 2, 5), _pair2( 2, 1, 5),
+ // 01 1x-
+ _pair3( 1, 1, 4)
+ },
+ .lut3 = {
+ // 001 00xxxx
+ _pair2(13, 1, 9), _pair2( 0, 6, 9), _pair2(12, 1, 9), _pair2(11, 1, 9),
+ _pair2( 3, 2, 9), _pair2( 1, 3, 9), _pair2( 0, 5, 9), _pair2(10, 1, 9),
+ // 001 xxx---
+ _pair4( 0, 3, 6), _pair4( 4, 1, 6), _pair4( 3, 1, 6)
+ },
+ .lut4 = {
+ // 0001 xxx
+ _pair( 7, 1), _pair( 6, 1), _pair( 1, 2), _pair( 5, 1)
+ },
+ .lut5 = {
+ // 00001 xxx
+ _pair( 2, 2), _pair( 9, 1), _pair( 0, 4), _pair( 8, 1)
+ },
+ .lut7 = {
+ // 0000001 xxxx
+ _pair(16, 1), _pair( 5, 2), _pair( 0, 7), _pair( 2, 3),
+ _pair( 1, 4), _pair(15, 1), _pair(14, 1), _pair( 4, 2)
+ },
+ .lut8 = {
+ // 00000001 xxxxx
+ _pair( 0, 11), _pair( 8, 2), _pair( 4, 3), _pair( 0, 10),
+ _pair( 2, 4), _pair( 7, 2), _pair(21, 1), _pair(20, 1),
+ _pair( 0, 9), _pair(19, 1), _pair(18, 1), _pair( 1, 5),
+ _pair( 3, 3), _pair( 0, 8), _pair( 6, 2), _pair(17, 1)
+ },
+ .lut9 = {
+ // 000000001 xxxxx
+ _pair(10, 2), _pair( 9, 2), _pair( 5, 3), _pair( 3, 4),
+ _pair( 2, 5), _pair( 1, 7), _pair( 1, 6), _pair( 0, 15),
+ _pair( 0, 14), _pair( 0, 13), _pair( 0, 12), _pair(26, 1),
+ _pair(25, 1), _pair(24, 1), _pair(23, 1), _pair(22, 1)
+ },
+ .lut10 = {
+ // 0000000001 xxxxx
+ _pair( 0, 31), _pair( 0, 30), _pair( 0, 29), _pair( 0, 28),
+ _pair( 0, 27), _pair( 0, 26), _pair( 0, 25), _pair( 0, 24),
+ _pair( 0, 23), _pair( 0, 22), _pair( 0, 21), _pair( 0, 20),
+ _pair( 0, 19), _pair( 0, 18), _pair( 0, 17), _pair( 0, 16)
+ },
+ .lut11 = {
+ // 00000000001 xxxxx
+ _pair( 0, 40), _pair( 0, 39), _pair( 0, 38), _pair( 0, 37),
+ _pair( 0, 36), _pair( 0, 35), _pair( 0, 34), _pair( 0, 33),
+ _pair( 0, 32), _pair( 1, 14), _pair( 1, 13), _pair( 1, 12),
+ _pair( 1, 11), _pair( 1, 10), _pair( 1, 9), _pair( 1, 8)
+ },
+ .lut12 = {
+ // 000000000001 xxxxx
+ _pair( 1, 18), _pair( 1, 17), _pair( 1, 16), _pair( 1, 15),
+ _pair( 6, 3), _pair(16, 2), _pair(15, 2), _pair(14, 2),
+ _pair(13, 2), _pair(12, 2), _pair(11, 2), _pair(31, 1),
+ _pair(30, 1), _pair(29, 1), _pair(28, 1), _pair(27, 1)
+ }
+};
+
+/* Internal globals */
+
+// Note that DecDCTvlc() and DecDCTvlc2() do *not* share the same variables.
+static VLC_Context _default_context;
+static size_t _max_buffer_size = 0;
+
+const DECDCTTAB *_vlc_huffman_table = &_default_huffman_table;
+
+/* Stateful VLC decoder API (for Sony SDK compatibility) */
+
+// Starts decoding a new bitstream if bs is non-null, otherwise resumes
+// decoding of the last one. The library's internal context and the size set
+// through DecDCTvlcSize() are used.
+int DecDCTvlc(const uint32_t *bs, uint32_t *buf) {
+    if (!bs)
+        return DecDCTvlcContinue(&_default_context, buf, _max_buffer_size);
+
+    return DecDCTvlcStart(&_default_context, buf, _max_buffer_size, bs);
+}
+
+// Replaces the maximum output size used by DecDCTvlc() and returns the value
+// that was previously set.
+size_t DecDCTvlcSize(size_t size) {
+    const size_t previous = _max_buffer_size;
+
+    _max_buffer_size = size;
+    return previous;
+}
+
+/* Lookup table relocation API */
+
+// Makes the decoder use a copy of the default lookup table placed at the
+// given address, or the default table itself if addr is null.
+void DecDCTvlcCopyTable(DECDCTTAB *addr) {
+    if (!addr) {
+        _vlc_huffman_table = &_default_huffman_table;
+        return;
+    }
+
+    _vlc_huffman_table = addr;
+    memcpy(addr, &_default_huffman_table, sizeof(DECDCTTAB));
+}
diff --git a/libpsn00b/psxpress/vlc.s b/libpsn00b/psxpress/vlc.s
new file mode 100644
index 0000000..fe51642
--- /dev/null
+++ b/libpsn00b/psxpress/vlc.s
@@ -0,0 +1,404 @@
+# PSn00bSDK MDEC library (GTE-accelerated VLC decompressor)
+# (C) 2022 spicyjpeg - MPL licensed
+#
+# Register map:
+# - $a0 = ctx
+# - $a1 = output
+# - $a2 = max_size
+# - $a3 = input
+# - $t0 = window
+# - $t1 = next_window
+# - $t2 = remaining
+# - $t3 = quant_scale
+# - $t4 = is_v3
+# - $t5 = bit_offset
+# - $t6 = block_index
+# - $t7 = coeff_index
+# - $t8 = _vlc_huffman_table
+# - $t9 = &ac_jump_area
+
+.set noreorder
+
+# Byte offsets of the fields of the VLC_Context structure declared in
+# psxpress.h. These must be kept in sync with the C declaration.
+.set VLC_Context_input, 0
+.set VLC_Context_window, 4
+.set VLC_Context_next_window, 8
+.set VLC_Context_remaining, 12
+.set VLC_Context_quant_scale, 16
+.set VLC_Context_is_v3, 18
+.set VLC_Context_bit_offset, 19
+.set VLC_Context_block_index, 20
+.set VLC_Context_coeff_index, 21
+
+# Byte offsets of the sub-tables within the DECDCTTAB structure declared in
+# psxpress.h. These must be kept in sync with the C declaration as well.
+.set DECDCTSMALLTAB_lut0, 0
+.set DECDCTSMALLTAB_lut2, 4
+.set DECDCTSMALLTAB_lut3, 36
+.set DECDCTSMALLTAB_lut4, 292
+.set DECDCTSMALLTAB_lut5, 308
+.set DECDCTSMALLTAB_lut7, 324
+.set DECDCTSMALLTAB_lut8, 356
+.set DECDCTSMALLTAB_lut9, 420
+.set DECDCTSMALLTAB_lut10, 484
+.set DECDCTSMALLTAB_lut11, 548
+.set DECDCTSMALLTAB_lut12, 612
+
+.section .text.DecDCTvlcStart
+.global DecDCTvlcStart
+.type DecDCTvlcStart, @function
+# int DecDCTvlcStart(VLC_Context *ctx, uint32_t *buf, size_t max_size, const uint32_t *bs)
+#
+# Parses the 8-byte header at the beginning of the bitstream ($a3), preloads
+# the first two words of compressed data and sets up $t0-$t7 as described in
+# the register map at the top of this file. Loads are interleaved with
+# unrelated instructions so that no register is used by the instruction
+# immediately following the one that loads it.
+DecDCTvlcStart:
+ # Create a new context on-the-fly without writing it to memory then jump
+ # into DecDCTvlcContinue(), skipping context loading.
+ lw $t0, 8($a3) # window = (bs->data[0] << 16) | (bs->data[0] >> 16)
+ nop # (wait for $t0 to be loaded)
+ srl $v0, $t0, 16
+ sll $t0, 16
+
+ lw $t1, 12($a3) # next_window = (bs->data[1] << 16) | (bs->data[1] >> 16)
+ or $t0, $v0
+ srl $v0, $t1, 16
+ sll $t1, 16
+
+ lhu $t2, 0($a3) # remaining = bs->uncomp_length * 2
+ or $t1, $v0
+
+ lhu $t3, 4($a3) # quant_scale = (bs->quant_scale & 63) << 10
+ sll $t2, 1
+ andi $t3, 63
+
+ lhu $t4, 6($a3) # is_v3 = !(bs->version < 3)
+ sll $t3, 10
+ sltiu $t4, $t4, 3
+ xori $t4, 1
+
+ li $t5, 32 # bit_offset = 32
+ li $t6, 5 # block_index = 5
+ li $t7, 0 # coeff_index = 0
+ j _vlc_skip_context_load
+ addiu $a3, 16 # input = &(bs->data[2]) (executed in the jump's delay slot)
+
+.section .text.DecDCTvlcContinue
+.global DecDCTvlcContinue
+.type DecDCTvlcContinue, @function
+DecDCTvlcContinue:
+ lw $a3, VLC_Context_input($a0)
+ lw $t0, VLC_Context_window($a0)
+ lw $t1, VLC_Context_next_window($a0)
+ lw $t2, VLC_Context_remaining($a0)
+ lhu $t3, VLC_Context_quant_scale($a0)
+ lb $t4, VLC_Context_is_v3($a0)
+ lb $t5, VLC_Context_bit_offset($a0)
+ lb $t6, VLC_Context_block_index($a0)
+ lb $t7, VLC_Context_coeff_index($a0)
+
+_vlc_skip_context_load:
+ # Determine how many bytes to output. This whole block of code basically
+ # does this:
+ # max_size = min((max_size - 1) * 2, remaining)
+ # remaining -= max_size
+ bgtz $a2, .Lmax_size_valid # if (max_size <= 0) max_size = 0x7ffe0000
+ addiu $a2, -1 # else max_size = (max_size - 1) * 2
+ lui $a2, 0x3fff
+.Lmax_size_valid:
+ sll $a2, 1
+
+ blt $a2, $t2, .Lmax_size_ok # if (max_size > remaining) max_size = remaining
+ lui $v1, 0x3800
+ move $a2, $t2
+.Lmax_size_ok:
+ subu $t2, $a2 # remaining -= max_size
+
+ # Write the length of the data that will be decoded to first 4 bytes of the
+ # output buffer, which will be then parsed by DecDCTin().
+ srl $v0, $a2, 1 # output[0] = 0x38000000 | (max_size / 2)
+ or $v0, $v1
+ sw $v0, 0($a1)
+
+ # Obtain the addresses of the lookup table and jump area in advance so that
+ # they don't have to be retrieved for each coefficient decoded.
+ lw $t8, _vlc_huffman_table
+ la $t9, .Lac_jump_area
+
+ beqz $a2, .Lstop_processing
+ addiu $a1, 4 # output = (uint16_t *) &output[1]
+
+.Lprocess_next_code_loop: # while (max_size)
+ # This is the "hot" part of the decoder, executed for each code in the
+ # bitstream. The first step is to determine if the next code is a DC or AC
+ # coefficient. The GTE is also given the task of counting the number of
+ # leading zeroes/ones, which takes 2 more cycles.
+ bnez $t7, .Lprocess_ac_coefficient
+ mtc2 $t0, $30
+ bnez $t4, .Lprocess_dc_v3_coefficient
+ #nop
+
+.Lprocess_dc_v2_coefficient: # if (!coeff_index && !is_v3)
+ # The DC coefficient in version 2 frames is not compressed.
+ srl $v0, $t0, 22 # *output = (window >> (32 - 10)) | quant_scale
+ or $v0, $t3
+ addiu $t7, 1 # coeff_index++
+ sll $t0, 10 # window <<= 10
+ addiu $t5, -10 # bit_offset -= 10
+ b .Lwrite_value
+ addiu $t7, 1 # coeff_index++
+
+.Lprocess_dc_v3_coefficient: # if (!coeff_index && is_v3)
+ # TODO: version 3 is currently not supported.
+ jr $ra
+ li $v0, -1
+ #b .Lwrite_value
+
+.Lprocess_ac_coefficient: # if (coeff_index)
+ # Check whether the prefix code is one of the shorter, more common ones.
+ srl $v0, $t0, 30
+ li $v1, 3
+ beq $v0, $v1, .Lac_prefix_11
+ li $v1, 2
+ beq $v0, $v1, .Lac_prefix_10
+ li $v1, 1
+ beq $v0, $v1, .Lac_prefix_01
+ #srl $v0, $t0, 29
+ #beq $v0, $v1, .Lac_prefix_001
+ #nop
+
+ # If the code is longer, retrieve the number of leading zeroes from the GTE
+ # and use it as an index into the jump area. Each block in the area is 8
+ # instructions long and handles decoding a specific prefix.
+ mfc2 $v0, $31
+ nop
+ andi $v0, 15 # jump_addr = &ac_jump_area[(prefix % 16) * 8 * sizeof(u32)]
+ sll $v0, 5
+ addu $v0, $t9
+ jr $v0
+ nop
+
+.Lac_prefix_11:
+ # Prefix 11 is followed by a single bit.
+ srl $v0, $t0, 28 # index = ((window >> (32 - 2 - 1)) & 1) * sizeof(u16)
+ andi $v0, 2
+ addu $v0, $t8 # value = table->lut0[index]
+ lhu $v0, DECDCTSMALLTAB_lut0($v0)
+ sll $t0, 3 # window <<= 3
+ addiu $t5, -3 # bit_offset -= 3
+ b .Lwrite_value
+ addiu $t7, 1 # coeff_index++
+
+.Lac_jump_area:
+.Lac_prefix_10:
+ # Prefix 10 marks the end of a block.
+ li $v0, 0xfe00 # value = 0xfe00
+ sll $t0, 2 # window <<= 2
+ addiu $t5, -2 # bit_offset -= 2
+ addiu $t6, -1 # block_index--
+ bgez $t6, .Lwrite_value
+ li $t7, 0 # coeff_index = 0
+ b .Lwrite_value
+ li $t6, 5 # if (block_index < 0) block_index = 5
+
+.Lac_prefix_01:
+ # Prefix 01 can be followed by a 2-bit lookup index starting with 1, or a
+ # 3-bit lookup index starting with 0. A 32-bit lookup table is used,
+ # containing both MDEC codes and lengths.
+ srl $v0, $t0, 25 # index = ((window >> (32 - 2 - 3)) & 7) * sizeof(u32)
+ andi $v0, 28
+ addu $v0, $t8 # value = table->lut2[index]
+ lw $v0, DECDCTSMALLTAB_lut2($v0)
+ addiu $t7, 1 # coeff_index++
+ b .Lupdate_window_and_write
+ srl $v1, $v0, 16 # length = value >> 16
+ .word 0
+
+.Lac_prefix_001:
+ # Prefix 001 can be followed by a 6-bit lookup index starting with 00, or a
+ # 3-bit lookup index starting with 01/10/11.
+ srl $v0, $t0, 21 # index = ((window >> (32 - 3 - 6)) & 63) * sizeof(u32)
+ andi $v0, 252
+ addu $v0, $t8 # value = table->lut3[index]
+ lw $v0, DECDCTSMALLTAB_lut3($v0)
+ addiu $t7, 1 # coeff_index++
+ b .Lupdate_window_and_write
+ srl $v1, $v0, 16 # length = value >> 16
+ .word 0
+
+.Lac_prefix_0001:
+ # Prefix 0001 is followed by a 3-bit lookup index.
+ srl $v0, $t0, 24 # index = ((window >> (32 - 4 - 3)) & 7) * sizeof(u16)
+ andi $v0, 14
+ addu $v0, $t8 # value = table->lut4[index]
+ lhu $v0, DECDCTSMALLTAB_lut4($v0)
+ sll $t0, 7 # window <<= 4 + 3
+ addiu $t5, -7 # bit_offset -= 4 + 3
+ b .Lwrite_value
+ addiu $t7, 1 # coeff_index++
+
+.Lac_prefix_00001:
+ # Prefix 00001 is followed by a 3-bit lookup index.
+ srl $v0, $t0, 23 # index = ((window >> (32 - 5 - 3)) & 7) * sizeof(u16)
+ andi $v0, 14
+ addu $v0, $t8 # value = table->lut5[index]
+ lhu $v0, DECDCTSMALLTAB_lut5($v0)
+ sll $t0, 8 # window <<= 5 + 3
+ addiu $t5, -8 # bit_offset -= 5 + 3
+ b .Lwrite_value
+ addiu $t7, 1 # coeff_index++
+
+.Lac_prefix_000001:
+ # Prefix 000001 is an escape code followed by a full 16-bit MDEC value.
+ srl $v0, $t0, 10 # value = window >> (32 - 6 - 16)
+ sll $t0, 22 # window <<= 6 + 16
+ addiu $t5, -22 # bit_offset -= 6 + 16
+ b .Lwrite_value
+ addiu $t7, 1 # coeff_index++
+ .word 0, 0, 0
+
+.Lac_prefix_0000001:
+ # Prefix 0000001 is followed by a 4-bit lookup index.
+ srl $v0, $t0, 20 # index = ((window >> (32 - 7 - 4)) & 15) * sizeof(u16)
+ andi $v0, 30
+ addu $v0, $t8 # value = table->lut7[index]
+ lhu $v0, DECDCTSMALLTAB_lut7($v0)
+ sll $t0, 11 # window <<= 7 + 4
+ addiu $t5, -11 # bit_offset -= 7 + 4
+ b .Lwrite_value
+ addiu $t7, 1 # coeff_index++
+
+.Lac_prefix_00000001:
+ # Prefix 00000001 is followed by a 5-bit lookup index.
+ srl $v0, $t0, 18 # index = ((window >> (32 - 8 - 5)) & 31) * sizeof(u16)
+ andi $v0, 62
+ addu $v0, $t8 # value = table->lut8[index]
+ lhu $v0, DECDCTSMALLTAB_lut8($v0)
+ sll $t0, 13 # window <<= 8 + 5
+ addiu $t5, -13 # bit_offset -= 8 + 5
+ b .Lwrite_value
+ addiu $t7, 1 # coeff_index++
+
+.Lac_prefix_000000001:
+ # Prefix 000000001 is followed by a 5-bit lookup index.
+ srl $v0, $t0, 17 # index = ((window >> (32 - 9 - 5)) & 31) * sizeof(u16)
+ andi $v0, 62
+ addu $v0, $t8 # value = table->lut9[index]
+ lhu $v0, DECDCTSMALLTAB_lut9($v0)
+ sll $t0, 14 # window <<= 9 + 5
+ addiu $t5, -14 # bit_offset -= 9 + 5
+ b .Lwrite_value
+ addiu $t7, 1 # coeff_index++
+
+.Lac_prefix_0000000001:
+ # Prefix 0000000001 is followed by a 5-bit lookup index.
+ srl $v0, $t0, 16 # index = ((window >> (32 - 10 - 5)) & 31) * sizeof(u16)
+ andi $v0, 62
+ addu $v0, $t8 # value = table->lut10[index]
+ lhu $v0, DECDCTSMALLTAB_lut10($v0)
+ sll $t0, 15 # window <<= 10 + 5
+ addiu $t5, -15 # bit_offset -= 10 + 5
+ b .Lwrite_value
+ addiu $t7, 1 # coeff_index++
+
+.Lac_prefix_00000000001:
+ # Prefix 00000000001 is followed by a 5-bit lookup index.
+ srl $v0, $t0, 15 # index = ((window >> (32 - 11 - 5)) & 31) * sizeof(u16)
+ andi $v0, 62
+ addu $v0, $t8 # value = table->lut11[index]
+ lhu $v0, DECDCTSMALLTAB_lut11($v0)
+ sll $t0, 16 # window <<= 11 + 5
+ addiu $t5, -16 # bit_offset -= 11 + 5
+ b .Lwrite_value
+ addiu $t7, 1 # coeff_index++
+
+.Lac_prefix_000000000001:
+ # Prefix 000000000001 is followed by a 5-bit lookup index.
+ srl $v0, $t0, 14 # index = ((window >> (32 - 12 - 5)) & 31) * sizeof(u16)
+ andi $v0, 62
+ addu $v0, $t8 # value = table->lut12[index]
+ lhu $v0, DECDCTSMALLTAB_lut12($v0)
+ sll $t0, 17 # window <<= 12 + 5
+ addiu $t5, -17 # bit_offset -= 12 + 5
+ b .Lwrite_value
+ addiu $t7, 1 # coeff_index++
+
+ # Prefix 0000000000001 is not valid.
+ beqz $t0, .Lstop_processing
+ nop
+ jr $ra
+ li $v0, -1
+ .word 0, 0, 0, 0
+
+ # Prefix 00000000000001 is not valid.
+ beqz $t0, .Lstop_processing
+ nop
+ jr $ra
+ li $v0, -1
+ .word 0, 0, 0, 0
+
+ # Prefix 000000000000001 is not valid.
+ beqz $t0, .Lstop_processing
+ nop
+ jr $ra
+ li $v0, -1
+ .word 0, 0, 0, 0
+
+ # Prefix 0000000000000001 is not valid.
+ beqz $t0, .Lstop_processing
+ nop
+ jr $ra
+ li $v0, -1
+ #.word 0, 0, 0, 0
+
+.Lupdate_window_and_write:
+ sllv $t0, $t0, $v1 # window <<= length
+ subu $t5, $v1 # bit_offset -= length
+.Lwrite_value:
+ sh $v0, 0($a1)
+.Lfeed_bitstream:
+ # Update the window. This makes sure the next iteration of the loop will be
+ # able to read up to 32 bits from the bitstream.
+ bgez $t5, .Lskip_feeding # if (bit_offset < 0)
+ addiu $a2, -1 # max_size--
+
+ subu $v0, $0, $t5 # window = next_window << (-bit_offset)
+ sllv $t0, $t1, $v0
+ lw $t1, 0($a3) # next_window = (*input << 16) | (*input >> 16)
+ addiu $t5, 32 # bit_offset += 32
+ srl $v0, $t1, 16
+ sll $t1, 16
+ or $t1, $v0
+ addiu $a3, 4 # input++
+
+.Lskip_feeding:
+ srlv $v0, $t1, $t5 # window |= next_window >> bit_offset
+ or $t0, $v0
+
+ bnez $a2, .Lprocess_next_code_loop
+ addiu $a1, 2 # output++
+
+.Lstop_processing:
+ # If remaining = 0, skip flushing the context, pad the output buffer with
+ # end-of-block codes if necessary and return 0. Otherwise flush the context
+ # and return 1.
+ beqz $t2, .Lpad_output_buffer
+ nop
+
+ sw $a3, VLC_Context_input($a0)
+ sw $t0, VLC_Context_window($a0)
+ sw $t1, VLC_Context_next_window($a0)
+ sw $t2, VLC_Context_remaining($a0)
+ sh $t3, VLC_Context_quant_scale($a0)
+ sb $t4, VLC_Context_is_v3($a0)
+ sb $t5, VLC_Context_bit_offset($a0)
+ sb $t6, VLC_Context_block_index($a0)
+ sb $t7, VLC_Context_coeff_index($a0)
+
+ jr $ra
+ li $v0, 1
+
+.Lpad_output_buffer:
+ beqz $a2, .Lreturn_zero
+ li $v0, 0xfe00
+.Lpad_output_buffer_loop: # while (max_size)
+ sh $v0, 0($a1) # *output = 0xfe00
+ addiu $a2, -1 # max_size--
+ bnez $a2, .Lpad_output_buffer_loop
+ addiu $a1, 2 # output++
+
+.Lreturn_zero:
+ jr $ra
+ li $v0, 0
diff --git a/libpsn00b/psxpress/vlc2.c b/libpsn00b/psxpress/vlc2.c
new file mode 100644
index 0000000..73b54b2
--- /dev/null
+++ b/libpsn00b/psxpress/vlc2.c
@@ -0,0 +1,240 @@
+/*
+ * PSn00bSDK MDEC library (alternate VLC decompressor and support code)
+ * (C) 2022 spicyjpeg - MPL licensed
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+#include <psxpress.h>
+
+// Evaluates to the smaller of x and y. Each argument appears twice in the
+// expansion, so it must not have side effects.
+#define _min(x, y) (((x) < (y)) ? (x) : (y))
+
+/* Huffman code lookup table */
+
+// Number of entries in the run-length compressed table below.
+#define TABLE_LENGTH 226
+
+// This table is run-length compressed. The lower 21 bits of each entry are the
+// value to be stored in the decompressed table: bits 0-15 are the 16-bit code
+// written to the output by DecDCTvlcContinue2() and bits 16-20 are the number
+// of bits it skips in the bitstream after a match. The upper 11 bits, which
+// would be otherwise unused, hold the number of times the value is repeated
+// minus one (0 = a single copy). The table is decompressed at runtime by
+// DecDCTvlcBuild().
+static const uint32_t _compressed_table[TABLE_LENGTH] = {
+ 0x03e00000, 0x000d000b, 0x000d03f5, 0x000d2002, 0x000d23fe, 0x000d1003,
+ 0x000d13fd, 0x000d000a, 0x000d03f6, 0x000d0804, 0x000d0bfc, 0x000d1c02,
+ 0x000d1ffe, 0x000d5402, 0x000d57fe, 0x000d5001, 0x000d53ff, 0x000d0009,
+ 0x000d03f7, 0x000d4c01, 0x000d4fff, 0x000d4801, 0x000d4bff, 0x000d0405,
+ 0x000d07fb, 0x000d0c03, 0x000d0ffd, 0x000d0008, 0x000d03f8, 0x000d1802,
+ 0x000d1bfe, 0x000d4401, 0x000d47ff, 0x006b4001, 0x006b43ff, 0x006b1402,
+ 0x006b17fe, 0x006b0007, 0x006b03f9, 0x006b0803, 0x006b0bfd, 0x006b0404,
+ 0x006b07fc, 0x006b3c01, 0x006b3fff, 0x006b3801, 0x006b3bff, 0x006b1002,
+ 0x006b13fe, 0x0fe00000, 0x03e80802, 0x03e80bfe, 0x03e82401, 0x03e827ff,
+ 0x03e80004, 0x03e803fc, 0x03e82001, 0x03e823ff, 0x07e71c01, 0x07e71fff,
+ 0x07e71801, 0x07e71bff, 0x07e70402, 0x07e707fe, 0x07e71401, 0x07e717ff,
+ 0x01e93401, 0x01e937ff, 0x01e90006, 0x01e903fa, 0x01e93001, 0x01e933ff,
+ 0x01e92c01, 0x01e92fff, 0x01e90c02, 0x01e90ffe, 0x01e90403, 0x01e907fd,
+ 0x01e90005, 0x01e903fb, 0x01e92801, 0x01e92bff, 0x0fe60003, 0x0fe603fd,
+ 0x0fe61001, 0x0fe613ff, 0x0fe60c01, 0x0fe60fff, 0x1fe50002, 0x1fe503fe,
+ 0x1fe50801, 0x1fe50bff, 0x3fe40401, 0x3fe407ff, 0xffe2fe00, 0x7fe30001,
+ 0x7fe303ff, 0x03e00000, 0x00110412, 0x001107ee, 0x00110411, 0x001107ef,
+ 0x00110410, 0x001107f0, 0x0011040f, 0x001107f1, 0x00111803, 0x00111bfd,
+ 0x00114002, 0x001143fe, 0x00113c02, 0x00113ffe, 0x00113802, 0x00113bfe,
+ 0x00113402, 0x001137fe, 0x00113002, 0x001133fe, 0x00112c02, 0x00112ffe,
+ 0x00117c01, 0x00117fff, 0x00117801, 0x00117bff, 0x00117401, 0x001177ff,
+ 0x00117001, 0x001173ff, 0x00116c01, 0x00116fff, 0x00300028, 0x003003d8,
+ 0x00300027, 0x003003d9, 0x00300026, 0x003003da, 0x00300025, 0x003003db,
+ 0x00300024, 0x003003dc, 0x00300023, 0x003003dd, 0x00300022, 0x003003de,
+ 0x00300021, 0x003003df, 0x00300020, 0x003003e0, 0x0030040e, 0x003007f2,
+ 0x0030040d, 0x003007f3, 0x0030040c, 0x003007f4, 0x0030040b, 0x003007f5,
+ 0x0030040a, 0x003007f6, 0x00300409, 0x003007f7, 0x00300408, 0x003007f8,
+ 0x006f001f, 0x006f03e1, 0x006f001e, 0x006f03e2, 0x006f001d, 0x006f03e3,
+ 0x006f001c, 0x006f03e4, 0x006f001b, 0x006f03e5, 0x006f001a, 0x006f03e6,
+ 0x006f0019, 0x006f03e7, 0x006f0018, 0x006f03e8, 0x006f0017, 0x006f03e9,
+ 0x006f0016, 0x006f03ea, 0x006f0015, 0x006f03eb, 0x006f0014, 0x006f03ec,
+ 0x006f0013, 0x006f03ed, 0x006f0012, 0x006f03ee, 0x006f0011, 0x006f03ef,
+ 0x006f0010, 0x006f03f0, 0x00ee2802, 0x00ee2bfe, 0x00ee2402, 0x00ee27fe,
+ 0x00ee1403, 0x00ee17fd, 0x00ee0c04, 0x00ee0ffc, 0x00ee0805, 0x00ee0bfb,
+ 0x00ee0407, 0x00ee07f9, 0x00ee0406, 0x00ee07fa, 0x00ee000f, 0x00ee03f1,
+ 0x00ee000e, 0x00ee03f2, 0x00ee000d, 0x00ee03f3, 0x00ee000c, 0x00ee03f4,
+ 0x00ee6801, 0x00ee6bff, 0x00ee6401, 0x00ee67ff, 0x00ee6001, 0x00ee63ff,
+ 0x00ee5c01, 0x00ee5fff, 0x00ee5801, 0x00ee5bff
+};
+
+/* Internal globals */
+
+// Note that DecDCTvlc() and DecDCTvlc2() do *not* share the same variables.
+// _default_context is the decoder state DecDCTvlc2() passes to
+// DecDCTvlcStart2()/DecDCTvlcContinue2(). _max_buffer_size is the max_size
+// value DecDCTvlc2() passes along with it; it is changed through
+// DecDCTvlcSize2() and 0 makes DecDCTvlcContinue2() use its built-in maximum.
+static VLC_Context _default_context;
+static size_t _max_buffer_size = 0;
+
+// Lookup table used by DecDCTvlcContinue2(). It is set by DecDCTvlcBuild() and
+// by DecDCTvlc2() (when a non-null table is passed); DecDCTvlcStart2() returns
+// -1 as long as it is null.
+const DECDCTTAB2 *_vlc_huffman_table2 = 0;
+
+/* VLC decoder */
+
+#define _get_bits_unsigned(length) (((uint32_t) window) >> (32 - (length)))
+#define _get_bits_signed(length) (((int32_t) window) >> (32 - (length)))
+#define _advance_window(num) \
+ window <<= (num); \
+ bit_offset -= (num);
+
+// Decodes the next chunk of a bitstream into buf, resuming from the state
+// stored in ctx.
+//
+// The first 32-bit word of buf receives a header (0x38000000 | number of
+// 32-bit words of decoded data that follow) and the decoded 16-bit values are
+// written right after it. max_size is the capacity of buf in 32-bit words,
+// header included; 0 is replaced with 0x7fffffff.
+//
+// Returns 0 if there is no data left to decode once this chunk is done, 1 if
+// there is more (in which case ctx is updated so the function can be called
+// again) or -1 if a version 3 bitstream is encountered, as those are not
+// supported yet. _vlc_huffman_table2 is dereferenced without being checked.
+int __attribute__((optimize(3))) DecDCTvlcContinue2(
+ VLC_Context *ctx, uint32_t *buf, size_t max_size
+) {
+ // Work on local copies of the context; they are only written back at the
+ // end, if there is more data left to decode.
+ const uint32_t *input = ctx->input;
+ uint32_t remaining = ctx->remaining;
+ uint32_t window = ctx->window;
+ uint32_t next_window = ctx->next_window;
+ uint16_t quant_scale = ctx->quant_scale;
+ int block_index = ctx->block_index;
+ int coeff_index = ctx->coeff_index;
+ int bit_offset = ctx->bit_offset;
+ int is_v3 = ctx->is_v3;
+
+ // The table check below is disabled; DecDCTvlcStart2() performs the same
+ // check before calling this function.
+ //if (!_vlc_huffman_table2)
+ //return -1;
+ if (!max_size)
+ max_size = 0x7fffffff;
+
+ // Write the length of the data that will be decoded to first 4 bytes of
+ // the output buffer, which will be then parsed by DecDCTin(). From this
+ // point on max_size is the number of 16-bit values to output (one word of
+ // the buffer being reserved for the header), capped to the number of
+ // values left in the bitstream.
+ max_size = _min((max_size - 1) * 2, remaining);
+ remaining -= max_size;
+
+ *buf = 0x38000000 | (max_size / 2);
+ uint16_t *output = (uint16_t *) &buf[1];
+
+ // Each iteration outputs exactly one 16-bit value.
+ for (; max_size; max_size--) {
+ uint32_t value;
+
+ if (coeff_index) {
+ // Parse the next AC coefficient. Most codes are decompressed via
+ // the lookup table, however some need special handling.
+ if ((window >> 30) == 0b10) {
+ // Prefix 10 marks the end of a block.
+ *output = 0xfe00;
+ _advance_window(2);
+
+ // coeff_index is incremented back to 0 below, so that the next
+ // value is parsed as a DC coefficient. block_index cycles
+ // through 0-5.
+ coeff_index = -1;
+ block_index++;
+ if (block_index > 5)
+ block_index = 0;
+ } else if ((window >> 26) == 0b000001) {
+ // Prefix 000001 is an escape code followed by a full 16-bit
+ // MDEC value. The cast drops the 6 prefix bits.
+ *output = (uint16_t) _get_bits_unsigned(22);
+ _advance_window(22);
+ } else if (window >> 24) {
+ // The first lookup table is for codes that do not start with
+ // 00000000. Each entry holds the code's length in its upper
+ // 16 bits and the value to output in its lower 16 bits.
+ value = _vlc_huffman_table2->lut[_get_bits_unsigned(13)];
+ _advance_window(value >> 16);
+ *output = (uint16_t) value;
+ } else {
+ // If the code starts with 00000000, use the second lookup
+ // table. As the top 8 bits are zero, the index is effectively
+ // 9 bits wide.
+ value = _vlc_huffman_table2->lut00[_get_bits_unsigned(17)];
+ _advance_window(value >> 16);
+ *output = (uint16_t) value;
+ }
+ } else {
+ // Parse the DC (first) coefficient for this block. Version 2
+ // simply stores the signed 10-bit value as-is, while version 3
+ // uses a delta encoding combined with a compression method similar
+ // to exp-Golomb.
+ if (is_v3) {
+ // TODO: version 3 is currently not supported.
+ return -1;
+ } else {
+ // quant_scale is combined with the coefficient as-is, without
+ // being shifted here.
+ value = _get_bits_unsigned(10);
+ *output = value | quant_scale;
+ _advance_window(10);
+ }
+ }
+
+ output++;
+ coeff_index++;
+
+ // Update the bitstream window. For whatever reason Sony's DecDCTvlc()
+ // implementation inefficiently reads the input stream 16 bits at a
+ // time and processes each 16-bit word starting from the MSB, so an
+ // endianness conversion is necessary to preserve bit order when
+ // reading 32 bits at a time. Also note that the PS1 CPU is not capable
+ // of shifting by more than 31 bits - it will shift by 0 bits instead!
+ if (bit_offset < 0) {
+ // All bits that were in the window before next_window was merged
+ // into it have been consumed: rebuild the window from next_window
+ // and fetch another word from the input.
+ window = next_window << (-bit_offset);
+ bit_offset += 32;
+ next_window = (*input << 16) | (*input >> 16);
+ input++;
+ };
+ window |= next_window >> bit_offset;
+ }
+
+ // Pad the buffer with end-of-block codes if necessary.
+ // NOTE(review): the loop above only terminates once max_size reaches
+ // zero, so this loop does not currently write anything.
+ for (; max_size; max_size--)
+ *(output++) = 0xfe00;
+
+ if (!remaining)
+ return 0;
+
+ // quant_scale and is_v3 are never modified by this function, so they do
+ // not have to be written back.
+ ctx->input = input;
+ ctx->remaining = remaining;
+ ctx->window = window;
+ ctx->next_window = next_window;
+ ctx->block_index = block_index;
+ ctx->coeff_index = coeff_index;
+ ctx->bit_offset = bit_offset;
+ return 1;
+}
+
+// Initializes ctx to decode the bitstream at bs (a BS_Header immediately
+// followed by the compressed data), then decodes the first chunk into buf by
+// calling DecDCTvlcContinue2() with the given max_size.
+//
+// Returns -1 if no lookup table has been set or if the version field of the
+// header is greater than 3, otherwise the value returned by
+// DecDCTvlcContinue2().
+int DecDCTvlcStart2(
+    VLC_Context *ctx, uint32_t *buf, size_t max_size, const uint32_t *bs
+) {
+    const BS_Header *hdr = (const BS_Header *) bs;
+
+    if (!_vlc_huffman_table2 || (hdr->version > 3))
+        return -1;
+
+    // The first two words of data are preloaded into the window, with their
+    // 16-bit halves swapped the same way DecDCTvlcContinue2() swaps the words
+    // it fetches.
+    const uint32_t *payload = (const uint32_t *) (hdr + 1);
+    const uint32_t first = payload[0];
+    const uint32_t second = payload[1];
+
+    ctx->input = payload + 2;
+    ctx->window = (first << 16) | (first >> 16);
+    ctx->next_window = (second << 16) | (second >> 16);
+    // The lower 16 bits of the MDEC header are a length in 32-bit words, while
+    // the decoder counts 16-bit values.
+    ctx->remaining = (hdr->mdec0_header & 0xffff) * 2;
+    ctx->quant_scale = (hdr->quant_scale & 63) << 10;
+    ctx->is_v3 = (hdr->version == 3);
+    ctx->bit_offset = 32;
+    ctx->block_index = 0;
+    ctx->coeff_index = 0;
+
+    return DecDCTvlcContinue2(ctx, buf, max_size);
+}
+
+/* Stateful VLC decoder API (for Sony SDK compatibility) */
+
+// Context-less wrapper around DecDCTvlcStart2() and DecDCTvlcContinue2(),
+// using the library's internal context and the buffer size last set through
+// DecDCTvlcSize2(). If table is non-null it becomes the lookup table used by
+// the decoder. Passing a non-null bs starts decoding a new bitstream, while a
+// null bs resumes decoding the previous one. The return value is the one
+// returned by the wrapped function.
+int DecDCTvlc2(const uint32_t *bs, uint32_t *buf, DECDCTTAB2 *table) {
+    if (table)
+        _vlc_huffman_table2 = table;
+
+    if (!bs)
+        return DecDCTvlcContinue2(&_default_context, buf, _max_buffer_size);
+
+    return DecDCTvlcStart2(&_default_context, buf, _max_buffer_size, bs);
+}
+
+// Sets the max_size value DecDCTvlc2() passes to the decoder and returns the
+// value that was previously set.
+size_t DecDCTvlcSize2(size_t size) {
+    const size_t previous = _max_buffer_size;
+
+    _max_buffer_size = size;
+    return previous;
+}
+
+/* Lookup table decompressor */
+
+// Decompresses _compressed_table into the given table, treated as a flat
+// array of 32-bit words filled from its beginning, and sets it as the lookup
+// table used by the decoder.
+void DecDCTvlcBuild(DECDCTTAB2 *table) {
+    uint32_t *dest = (uint32_t *) table;
+    _vlc_huffman_table2 = table;
+
+    for (int index = 0; index < TABLE_LENGTH; index++) {
+        const uint32_t entry = _compressed_table[index];
+        const uint32_t value = entry & 0x001fffff;
+        // The upper 11 bits of each entry hold the run length minus one.
+        int count = (int) (entry >> 21) + 1;
+
+        while (count--)
+            *(dest++) = value;
+    }
+}