#!/usr/bin/env python3
# Simple MDEC image encoder (requires PIL/Pillow and NumPy to be installed)
# (C) 2022 spicyjpeg - MPL licensed

import math
from warnings import warn
from argparse import ArgumentParser, FileType

import numpy
from PIL import Image

# Default quantization scales, used as the defaults of the -y/--luma and
# -c/--chroma command-line options.
LUMA_SCALE   = 8
CHROMA_SCALE = 16

## Tables

# The literal below lists, for each row-major position of an 8x8 block, its
# index in zigzag scan order. argsort() inverts that permutation, so the
# resulting table maps each zigzag index to the row-major position to read
# from. Indexing a flattened 64-entry block with it yields the coefficients in
# zigzag order.
ZIGZAG_TABLE = numpy.array((
	 0,  1,  5,  6, 14, 15, 27, 28,
	 2,  4,  7, 13, 16, 26, 29, 42,
	 3,  8, 12, 17, 25, 30, 41, 43,
	 9, 11, 18, 24, 31, 40, 44, 53,
	10, 19, 23, 32, 39, 45, 52, 54,
	20, 22, 33, 38, 46, 51, 55, 60,
	21, 34, 37, 47, 50, 56, 59, 61,
	35, 36, 48, 49, 57, 58, 62, 63
), numpy.uint8).argsort()

# The default luma and chroma quantization table is based on the MPEG-1
# quantization table, with the only difference being the first value (2 instead
# of 8). It is kept as an 8x8 array in row-major (not zigzag) order so that the
# 8x8 DCT output can be divided by it element-wise before reordering.
QUANT_TABLE = numpy.array((
	 2, 16, 19, 22, 26, 27, 29, 34,
	16, 16, 22, 24, 27, 29, 34, 37,
	19, 22, 26, 27, 29, 34, 34, 38,
	22, 22, 26, 27, 29, 34, 37, 40,
	22, 26, 27, 29, 32, 35, 40, 48,
	26, 27, 29, 32, 35, 40, 48, 58,
	26, 27, 29, 34, 38, 46, 56, 69,
	27, 29, 35, 38, 46, 56, 69, 83
), numpy.uint8).reshape(( 8, 8 ))

# S[i] = cos(i * pi / 16) / 2 for i = 1..7. S[0] uses the i = 4 angle instead
# (the "i or 4" trick), i.e. cos(pi / 4) / 2 = (1 / sqrt(2)) / 2, which is the
# value of the constant first row of the matrix below.
S = [ math.cos((i or 4) / 16 * math.pi) / 2 for i in range(8) ]

# 8-point DCT-II basis matrix built from S, one row per frequency. The 2D
# transform of an 8x8 block is DCT_MATRIX @ block @ DCT_MATRIX.T.
DCT_MATRIX = numpy.array((
	 S[0],  S[0],  S[0],  S[0],  S[0],  S[0],  S[0],  S[0],
	 S[1],  S[3],  S[5],  S[7], -S[7], -S[5], -S[3], -S[1],
	 S[2],  S[6], -S[6], -S[2], -S[2], -S[6],  S[6],  S[2],
	 S[3], -S[7], -S[1], -S[5],  S[5],  S[1],  S[7], -S[3],
	 S[4], -S[4], -S[4],  S[4],  S[4], -S[4], -S[4],  S[4],
	 S[5], -S[1],  S[7],  S[3], -S[3], -S[7],  S[1], -S[5],
	 S[6], -S[2],  S[2], -S[6], -S[6],  S[2], -S[2],  S[6],
	 S[7], -S[5],  S[3], -S[1],  S[1], -S[3],  S[5], -S[7]
), numpy.float32).reshape(( 8, 8 ))

## Helpers

def to_int10(value):
	"""
	Convert a number to a 10-bit two's complement field.

	The value is truncated to an integer, saturated to the signed 10-bit range
	(-512 to 511) and returned as an unsigned integer in the 0-1023 range.
	"""
	number = int(value)

	if number > 0x1ff:
		number = 0x1ff
	elif number < -0x200:
		number = -0x200

	# Masking to 10 bits maps -512..-1 onto 0x200..0x3ff and leaves
	# non-negative values untouched.
	return number & 0x3ff

def rgb_to_ycbcr_planar(image):
	"""
	Convert an interleaved RGB array into separate Y, Cb and Cr planes.

	image is a 3-dimensional array whose last axis holds the R, G and B
	components in the 0-255 range. Returns a (y, cb, cr) tuple of float32
	arrays, each with the shape of the first two axes of the input.
	"""
	normalized = image.astype(numpy.float32) / 255.0

	# Move the component axis first so the three planes can be unpacked.
	red, green, blue = numpy.transpose(normalized, ( 2, 0, 1 ))

	# Coefficients from the ITU-R BT.601 conversion:
	# https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
	luma      =  16 + red *  65.481 + green * 128.553 + blue *  24.966
	blue_diff = 128 - red *  37.797 - green *  74.203 + blue * 112.000
	red_diff  = 128 + red * 112.000 - green *  93.786 - blue *  18.214

	return luma, blue_diff, red_diff

## Block encoder

def encode_block(buffer, block, scale):
	"""
	Encode a single 8x8 block of samples as MDEC run-length data.

	buffer is a writable array (or view) of 16-bit values that receives the
	output starting at index 0; it must have room for up to 66 values (header,
	63 AC codes and two end-of-block codes). block is an 8x8 array of samples
	in the 0-255 range (128 is subtracted to center them around zero). scale
	is the quantization scale stored in the block header and applied to the AC
	coefficients.

	Returns the number of 16-bit values written, which is always even.

	Raises ValueError if scale is outside the 1-63 range.
	"""
	# The scale is stored in a 6-bit field and used as a divisor below, so 0
	# would divide by zero and produce garbage codes. (The MDEC documentation
	# describes a scale of 0 as selecting a separate unquantized mode, which
	# this encoder does not generate.)
	if not 1 <= scale <= 63:
		raise ValueError("quantization scale must be in 1-63 range")

	# Perform discrete cosine transform on the block, divide the coefficients by
	# the quantization table and reorder them in zigzag order.
	_block = block.astype(numpy.float32) - 128.0
	coeffs = (DCT_MATRIX @ _block @ DCT_MATRIX.T) / QUANT_TABLE
	coeffs = coeffs.reshape(( 64, ))[ZIGZAG_TABLE]

	# The first value holds the quantization scale and the DC coefficient. With
	# scale 63 and a DC value of -512 it would be exactly 0xfe00, which is the
	# same value used below as end-of-block/padding code and would thus not be
	# recognized as the start of a block. Nudge the DC value to -511 (0xfe01)
	# in that case.
	header = (scale << 10) | to_int10(round(coeffs[0]))
	if header == 0xfe00:
		header = 0xfe01

	buffer[0] = header
	offset    = 1

	# Divide the AC coefficients by the given quantization scale and encode them
	# as run-length pairs by counting how many zeroes there are between each
	# non-zero value.
	ac_values  = coeffs[1:] * 8.0 / scale
	run_length = 0

	for ac in ac_values.round().astype(numpy.int32):
		if ac:
			buffer[offset] = (run_length << 10) | to_int10(ac)
			offset += 1

			run_length  = 0
		else:
			run_length += 1

	# Flush any remaining zeroes as a single code with a zero value. The run
	# length field counts the zeroes *before* the value, hence the - 1.
	if run_length:
		buffer[offset] = (run_length - 1) << 10
		offset += 1

	# Add 1 or 2 end-of-block codes depending on whether the number of 16-bit
	# values output so far is odd or even. Some emulators will break if blocks
	# are not 32-bit aligned.
	buffer[offset] = 0xfe00
	offset += 1
	if offset % 2:
		buffer[offset] = 0xfe00
		offset += 1

	return offset

def encode_macroblock(buffer, block, y_scale, c_scale):
	"""
	Encode a 16x16 YCbCr macroblock as six consecutive 8x8 blocks.

	buffer receives the output starting at index 0. block is a 16x16x3 array
	whose last axis holds the Y, Cb and Cr components. y_scale and c_scale are
	the quantization scales used for the luma and chroma blocks respectively.

	Returns the total number of 16-bit values written.
	"""
	# The input is expected to be in YCbCr format already, so the planes only
	# need to be separated (rgb_to_ycbcr_planar() is not used here).
	luma, blue_diff, red_diff = block.transpose(( 2, 0, 1 ))

	# The MDEC uses 4:2:0 chroma subsampling: Cr and Cb are encoded first at
	# half resolution, followed by the four luma quadrants (Y1-4).
	# TODO: use bilinear sampling instead of nearest-neighbor for chroma
	planes = (
		(red_diff [0:16:2, 0:16:2], c_scale),
		(blue_diff[0:16:2, 0:16:2], c_scale),
		(luma[0: 8, 0: 8], y_scale),
		(luma[0: 8, 8:16], y_scale),
		(luma[8:16, 0: 8], y_scale),
		(luma[8:16, 8:16], y_scale)
	)

	written = 0
	for plane, scale in planes:
		written += encode_block(buffer[written:], plane, scale)

	return written

## Main

def get_args():
	"""
	Define the command-line interface and parse the program's arguments.

	Returns the namespace produced by argparse, with input_file and output
	already opened as binary file objects, monochrome as a boolean and luma
	and chroma as integers.
	"""
	parser = ArgumentParser(
		description = "Generates uncompressed MDEC bitstream data from an image."
	)
	add = parser.add_argument

	# Positional argument and output path
	add("input_file", help = "input image file", type = FileType("rb"))
	add(
		"-o", "--output",
		metavar = "file",
		default = "image.bin",
		type    = FileType("wb"),
		help    = "where to output converted image data (image.bin by default)"
	)

	# Encoding options
	add(
		"-m", "--monochrome",
		help   = "encode image as monochrome (8x8 blocks) instead of color (16x16 macroblocks)",
		action = "store_true"
	)
	add(
		"-y", "--luma",
		metavar = "scale",
		default = LUMA_SCALE,
		type    = int,
		help    = f"quantization scale for luma/monochrome blocks (0-63, default {LUMA_SCALE})"
	)
	add(
		"-c", "--chroma",
		metavar = "scale",
		default = CHROMA_SCALE,
		type    = int,
		help    = f"quantization scale for chroma blocks (0-63, default {CHROMA_SCALE})"
	)

	return parser.parse_args()

def main():
	"""
	Command-line entry point: load an image, encode it and write the result.

	The image is converted to YCbCr, trimmed to a whole number of 8x8 blocks
	(monochrome mode) or 16x16 macroblocks (color mode), encoded in
	column-major order and written to the output file, padded with
	end-of-block codes to a multiple of 128 bytes.

	Raises ValueError if a quantization scale is out of range or if the image
	is smaller than a single block.
	"""
	args = get_args()
	if args.luma < 0 or args.luma > 63:
		raise ValueError("luma quantization scale must be in 0-63 range")
	if args.chroma < 0 or args.chroma > 63:
		raise ValueError("chroma quantization scale must be in 0-63 range")

	# Load all pixels while the input file is open, then release the handle
	# argparse opened for us.
	with args.input_file as _input:
		image = Image.open(_input, "r")
		data  = numpy.array(image.convert("YCbCr"), numpy.uint8)

	size          = 8 if args.monochrome else 16
	height, width = data.shape[:2]

	if width % size:
		warn(RuntimeWarning(f"image width is not a multiple of {size}, trimming"))
	if height % size:
		warn(RuntimeWarning(f"image height is not a multiple of {size}, trimming"))

	# Actually drop the partial blocks on the right and bottom edges; passing
	# them to the encoder would fail as the DCT only accepts full 8x8 blocks.
	width  -= width  % size
	height -= height % size

	if not width or not height:
		raise ValueError(f"image must be at least {size}x{size} pixels")

	# Preallocate a buffer large enough for the worst case (faster than
	# expanding an array dynamically -- this script is too slow already). Each
	# 8x8 block takes up at most 66 16-bit values (header, 63 AC codes and 2
	# end-of-block codes) and a macroblock is made up of 6 blocks. The size is
	# rounded up to leave room for the padding added below.
	blocks   = (width // size) * (height // size) * (1 if args.monochrome else 6)
	capacity = (blocks * 66 + 63) & ~63
	buffer   = numpy.empty(capacity, numpy.uint16)
	offset   = 0

	# Split the image into 8x8 or 16x16 blocks and encode them in column-major
	# order.
	for x in range(0, width, size):
		for y in range(0, height, size):
			block = data[y:(y + size), x:(x + size)]

			if args.monochrome:
				offset += encode_block(buffer[offset:], block[:, :, 0], args.luma)
			else:
				offset += encode_macroblock(buffer[offset:], block, args.luma, args.chroma)

	# Pad the generated data to the size of a DMA chunk (32x 32-bit words or
	# 128 bytes).
	length = (offset + 63) & ~63
	buffer[offset:length] = 0xfe00

	# The length is in 16-bit units while the limit is 0xffff 32-bit words.
	if length > (0xffff * 2):
		warn(RuntimeWarning("image is too large to be decoded with a single DecDCTin() call"))

	with args.output as _file:
		buffer[0:length].tofile(_file)

# Only run the encoder when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
	main()