diff --git a/benchmarks/Makefile b/benchmarks/Makefile
new file mode 100644
index 0000000..efd450e
--- /dev/null
+++ b/benchmarks/Makefile
@@ -0,0 +1,37 @@
+# These are testcases & benchmarks for the library on the target processors
+# (currently ARM Cortex M3 and AVR). They are a bit tricky to run, as they
+# depend on specific simulator versions.
+
+FILES = benchmark.c ../libfixmath/fix16.c ../libfixmath/fix16_sqrt.c ../libfixmath/fix16_exp.c
+
+CFLAGS = -DFIXMATH_NO_OVERFLOW -DFIXMATH_NO_ROUNDING -ffast-math -I../libfixmath
+
+# The run-* targets never create a file of that name.
+.PHONY: run-benchmark-arm run-benchmark-avr
+
+# Regenerate the (randomised) testcase table whenever the generator changes.
+testcases.c: generate_testcases.py
+	python $<
+
+# Note: this needs hacked QEmu that "makes no sense":
+# https://bugs.launchpad.net/qemu/+bug/696094
+benchmark-arm.elf: $(FILES) interface-arm.c testcases.c
+	arm-none-eabi-gcc -mcpu=cortex-m3 -mthumb -T generic-m-hosted.ld \
+	-Wall -O2 $(CFLAGS) \
+	-o $@ -I .. $(FILES) interface-arm.c -lm
+
+run-benchmark-arm: benchmark-arm.elf
+	qemu-system-arm -cpu cortex-m3 -icount 0 -device armv7m_nvic \
+	-nographic -monitor null -serial null \
+	-semihosting -kernel $<
+
+benchmark-avr.elf: $(FILES) interface-avr.c testcases.c
+	avr-gcc -mmcu=atmega128 $(CFLAGS) \
+	-Wall -O2 -DFIXMATH_OPTIMIZE_8BIT \
+	-o $@ -I .. $(FILES) interface-avr.c
+
+# Note: this needs simulavrxx 1.0rc0 or newer
+run-benchmark-avr: benchmark-avr.elf
+	simulavr -d atmega128 -f $< -W 0x20,- -T exit
+
+
diff --git a/benchmarks/benchmark.c b/benchmarks/benchmark.c
new file mode 100644
index 0000000..de83388
--- /dev/null
+++ b/benchmarks/benchmark.c
@@ -0,0 +1,214 @@
+#ifndef NO_FLOAT
+#include <math.h>
+#endif
+
+#include <fix16.h>
+#include "interface.h"
+#include <stdint.h>
+
+/* Autogenerated testcases */
+#include "testcases.c"
+
+/* Tools for profiling */
+
+typedef struct {
+    uint32_t min;   // smallest single measurement seen so far
+    uint32_t max;   // largest single measurement seen so far
+    uint32_t sum;   // total of all measurements
+    uint32_t count; // number of measurements
+} cyclecount_t;
+
+// Initializer for cyclecount_t structure.
+// Max is initialized to 0 and min is 2^32-1 so that first call to cyclecount_update will set them.
+#define CYCLECOUNT_INIT {0xFFFFFFFF, 0, 0, 0}
+
+// Update cyclecount_t structure after a single measurement has been made.
+static void cyclecount_update(cyclecount_t *data, uint32_t cycles)
+{
+    if (cycles < data->min)
+        data->min = cycles;
+    if (cycles > data->max)
+        data->max = cycles;
+
+    data->sum += cycles;
+    data->count++;
+}
+
+// Time a single statement and record its cycle count in `variable`.
+#define MEASURE(variable, statement) do { \
+    start_timing(); \
+    statement; \
+    cyclecount_update(&(variable), end_timing()); \
+} while (0)
+
+// Print min/max/avg of `variable`; the average is 0 if nothing was measured.
+#define PRINT(variable, label) do { \
+    print_value(label " min", (variable).min); \
+    print_value(label " max", (variable).max); \
+    print_value(label " avg", (variable).count ? (variable).sum / (variable).count : 0); \
+} while (0)
+
+static cyclecount_t exp_cycles = CYCLECOUNT_INIT;
+static cyclecount_t sqrt_cycles = CYCLECOUNT_INIT;
+static cyclecount_t add_cycles = CYCLECOUNT_INIT;
+static cyclecount_t sub_cycles = CYCLECOUNT_INIT;
+static cyclecount_t div_cycles = CYCLECOUNT_INIT;
+static cyclecount_t mul_cycles = CYCLECOUNT_INIT;
+
+#ifndef NO_FLOAT
+static cyclecount_t float_sqrtf_cycles = CYCLECOUNT_INIT;
+static cyclecount_t float_add_cycles = CYCLECOUNT_INIT;
+static cyclecount_t float_sub_cycles = CYCLECOUNT_INIT;
+static cyclecount_t float_div_cycles = CYCLECOUNT_INIT;
+static cyclecount_t float_mul_cycles = CYCLECOUNT_INIT;
+#endif
+
+// Absolute error |result - expected|, saturated to INT32_MAX.
+static fix16_t delta(fix16_t result, fix16_t expected)
+{
+    // 64-bit: a 32-bit subtraction could overflow, which is undefined in C.
+    int64_t diff = (int64_t)result - (int64_t)expected;
+
+#ifdef FIXMATH_NO_OVERFLOW
+    // Ignore overflow errors when the detection is turned off
+    if (expected == fix16_min)
+        return 0;
+#endif
+    if (diff < 0)
+        diff = -diff;
+    return (diff > INT32_MAX) ? INT32_MAX : (fix16_t)diff;
+}
+
+#ifdef FIXMATH_NO_ROUNDING
+const fix16_t max_delta = 1;
+#else
+const fix16_t max_delta = 0;
+#endif
+
+int main(void)
+{
+    int i;
+    interface_init();
+
+    start_timing();
+    print_value("Timestamp bias", end_timing());
+
+    for (i = 0; i < TESTCASES1_COUNT; i++)
+    {
+        fix16_t input = testcases1[i].a;
+        fix16_t result;
+        fix16_t expected = testcases1[i].sqrt;
+        MEASURE(sqrt_cycles, result = fix16_sqrt(input));
+
+        if (input > 0 && delta(result, expected) > max_delta)
+        {
+            print_value("Failed SQRT, i", i);
+            print_value("Failed SQRT, input", input);
+            print_value("Failed SQRT, output", result);
+            print_value("Failed SQRT, expected", expected);
+        }
+
+        expected = testcases1[i].exp;
+        MEASURE(exp_cycles, result = fix16_exp(input));
+
+        if (delta(result, expected) > 400)
+        {
+            print_value("Failed EXP, i", i);
+            print_value("Failed EXP, input", input);
+            print_value("Failed EXP, output", result);
+            print_value("Failed EXP, expected", expected);
+        }
+    }
+    PRINT(sqrt_cycles, "fix16_sqrt");
+    PRINT(exp_cycles, "fix16_exp");
+
+    for (i = 0; i < TESTCASES2_COUNT; i++)
+    {
+        fix16_t a = testcases2[i].a;
+        fix16_t b = testcases2[i].b;
+        volatile fix16_t result;
+
+        fix16_t expected = testcases2[i].add;
+        MEASURE(add_cycles, result = fix16_add(a, b));
+        if (delta(result, expected) > max_delta)
+        {
+            print_value("Failed ADD, i", i);
+            print_value("Failed ADD, a", a);
+            print_value("Failed ADD, b", b);
+            print_value("Failed ADD, output", result);
+            print_value("Failed ADD, expected", expected);
+        }
+
+        expected = testcases2[i].sub;
+        MEASURE(sub_cycles, result = fix16_sub(a, b));
+        if (delta(result, expected) > max_delta)
+        {
+            print_value("Failed SUB, i", i);
+            print_value("Failed SUB, a", a);
+            print_value("Failed SUB, b", b);
+            print_value("Failed SUB, output", result);
+            print_value("Failed SUB, expected", expected);
+        }
+
+        expected = testcases2[i].mul;
+        MEASURE(mul_cycles, result = fix16_mul(a, b));
+        if (delta(result, expected) > max_delta)
+        {
+            print_value("Failed MUL, i", i);
+            print_value("Failed MUL, a", a);
+            print_value("Failed MUL, b", b);
+            print_value("Failed MUL, output", result);
+            print_value("Failed MUL, expected", expected);
+        }
+
+        if (b != 0)
+        {
+            expected = testcases2[i].div;
+            MEASURE(div_cycles, result = fix16_div(a, b));
+            if (delta(result, expected) > max_delta)
+            {
+                print_value("Failed DIV, i", i);
+                print_value("Failed DIV, a", a);
+                print_value("Failed DIV, b", b);
+                print_value("Failed DIV, output", result);
+                print_value("Failed DIV, expected", expected);
+            }
+        }
+    }
+    PRINT(add_cycles, "fix16_add");
+    PRINT(sub_cycles, "fix16_sub");
+    PRINT(mul_cycles, "fix16_mul");
+    PRINT(div_cycles, "fix16_div");
+
+    /* Compare with floating point performance */
+#ifndef NO_FLOAT
+    for (i = 0; i < TESTCASES1_COUNT; i++)
+    {
+        float input = fix16_to_float(testcases1[i].a);
+        volatile float result;
+        MEASURE(float_sqrtf_cycles, result = sqrtf(input));
+    }
+    PRINT(float_sqrtf_cycles, "float sqrtf");
+
+    for (i = 0; i < TESTCASES2_COUNT; i++)
+    {
+        float a = fix16_to_float(testcases2[i].a);
+        float b = fix16_to_float(testcases2[i].b);
+        volatile float result;
+        MEASURE(float_add_cycles, result = a + b);
+        MEASURE(float_sub_cycles, result = a - b);
+        MEASURE(float_mul_cycles, result = a * b);
+
+        if (b != 0)
+        {
+            MEASURE(float_div_cycles, result = a / b);
+        }
+    }
+    PRINT(float_add_cycles, "float add");
+    PRINT(float_sub_cycles, "float sub");
+    PRINT(float_mul_cycles, "float mul");
+    PRINT(float_div_cycles, "float div");
+#endif
+
+    return 0;
+}
diff --git a/benchmarks/generate_testcases.py b/benchmarks/generate_testcases.py
new file mode 100644
index 0000000..560ba4d
--- /dev/null
+++ b/benchmarks/generate_testcases.py
@@ -0,0 +1,140 @@
+'''This script precalculates the correct solutions for a set of test numbers,
+and writes them to testcases.c. This is aimed at running the tests on-target,
+therefore it doesn't test all the cases or use floating point math, but
+instead generates a ~10k binary.
+
+The tests are chosen randomly, so there is quite a good chance to eventually
+catch most errors. Because the list is not regenerated automatically, the
+functioning of the benchmark application is still deterministic and easy
+to debug.
+'''
+
+import math
+import random
+import struct
+
+# Fix16 scaling factor
+scale = 65536.0
+
+# Fix16 overflow indicator
+overflow = -2**31
+
+def f16_to_float(val):
+    '''Convert a raw fix16 integer to the float it represents.'''
+    return val / scale
+
+def float_to_f16(val):
+    '''Convert a float to raw fix16; out-of-range values yield `overflow`.'''
+    val = int(round(val * scale))
+    if val >= 2**31 or val < -2**31:
+        val = overflow
+    return val
+
+def to_ui32(val):
+    '''Reinterpret a signed 32-bit value as unsigned, for hex printing.'''
+    return struct.unpack('I', struct.pack('i', val))[0]
+
+testcases = [
+    # Small numbers
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+    -1, -2, -3, -4, -5, -6, -7, -8, -9, -10,
+
+    # Integer numbers
+    0x10000, -0x10000, 0x20000, -0x20000, 0x30000, -0x30000,
+    0x40000, -0x40000, 0x50000, -0x50000, 0x60000, -0x60000,
+
+    # Fractions (1/2, 1/4, 1/8)
+    0x8000, -0x8000, 0x4000, -0x4000, 0x2000, -0x2000,
+
+    # Problematic carry
+    0xFFFF, -0xFFFF, 0x1FFFF, -0x1FFFF, 0x3FFFF, -0x3FFFF,
+
+    # Smallest and largest values
+    0x7FFFFFFF, -0x80000000
+]
+
+for i in range(10):
+    # Large random numbers
+    testcases.append(random.randint(-0x80000000, 0x7FFFFFFF))
+
+    # Small random numbers
+    testcases.append(random.randint(-100000, 100000))
+
+    # Tiny random numbers
+    testcases.append(random.randint(-200, 200))
+
+with open("testcases.c", "w") as out:
+    out.write('''
+/* Automatically generated testcases for fix16 operations
+ * See generate_testcases.py for the generator.
+ */
+
+#include <fix16.h>
+
+typedef struct {
+    // Input
+    fix16_t a;
+
+    // Correct output
+    fix16_t sqrt;
+    fix16_t exp;
+} fix16_1op_testcase;
+
+typedef struct {
+    // Inputs
+    fix16_t a;
+    fix16_t b;
+
+    // Correct output
+    fix16_t add;
+    fix16_t sub;
+    fix16_t mul;
+    fix16_t div;
+} fix16_2op_testcase;
+
+#define TESTCASES1_COUNT (sizeof(testcases1)/sizeof(testcases1[0]))
+#define TESTCASES2_COUNT (sizeof(testcases2)/sizeof(testcases2[0]))
+
+''')
+
+    # Write testcases for 1-operand functions
+
+    out.write('static const fix16_1op_testcase testcases1[] = {\n')
+
+    for i in range(10):
+        a = random.choice(testcases)
+        if a >= 0:
+            sqrt = float_to_f16(math.sqrt(f16_to_float(a)))
+        else:
+            sqrt = 0
+
+        try:
+            exp = float_to_f16(math.exp(f16_to_float(a)))
+        except OverflowError:
+            exp = 0x7FFFFFFF
+
+        out.write('    {0x%08x, 0x%08x, 0x%08x}, // %d\n'
+                  % (to_ui32(a), to_ui32(sqrt), to_ui32(exp), i))
+
+    out.write('};\n\n')
+
+    # Write testcases for 2-operand functions
+
+    out.write('static const fix16_2op_testcase testcases2[] = {\n')
+
+    for i in range(50):
+        a = random.choice(testcases)
+        b = random.choice(testcases)
+
+        add = float_to_f16(f16_to_float(a) + f16_to_float(b))
+        sub = float_to_f16(f16_to_float(a) - f16_to_float(b))
+        mul = float_to_f16(f16_to_float(a) * f16_to_float(b))
+        if b != 0:
+            div = float_to_f16(f16_to_float(a) / f16_to_float(b))
+        else:
+            div = 0
+        out.write('    {0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x}, // %d\n'
+                  % (to_ui32(a), to_ui32(b), to_ui32(add), to_ui32(sub), to_ui32(mul), to_ui32(div), i))
+
+    out.write('};\n\n')
+
diff --git a/benchmarks/interface-arm.c b/benchmarks/interface-arm.c
new file mode 100644
index 0000000..cd37979
--- /dev/null
+++ b/benchmarks/interface-arm.c
@@ -0,0 +1,32 @@
+#include "interface.h"
+#include <stdio.h>
+#include <stdint.h>
+
+// This targets an ARM Cortex M3 core using QEmu LM3S6965 emulation.
+#define STBASE 0xE000E000
+#define STCTRL (*(volatile uint32_t*)(0x010 + STBASE))
+#define STRELOAD (*(volatile uint32_t*)(0x014 + STBASE))
+#define STCURRENT (*(volatile uint32_t*)(0x018 + STBASE))
+
+// Load the largest 24-bit reload value, then start the counter (STCTRL = 5).
+void interface_init(void)
+{
+    STRELOAD = 0x00FFFFFF;
+    STCTRL = 5;
+}
+
+void start_timing(void)
+{
+    STCURRENT = 0;
+}
+
+uint32_t end_timing(void) // 32 bits: the elapsed count can exceed 16 bits
+{
+    return 0x00FFFFFF - STCURRENT - 4; // 4: presumably measurement overhead
+}
+
+void print_value(const char *label, int32_t value)
+{
+    printf("%-20s %ld\n", label, (long)value);
+}
+
diff --git a/benchmarks/interface-avr.c b/benchmarks/interface-avr.c
new file mode 100644
index 0000000..02731aa
--- /dev/null
+++ b/benchmarks/interface-avr.c
@@ -0,0 +1,42 @@
+#include <avr/io.h>
+#include <stdint.h>
+#include "interface.h"
+#include <stdio.h>
+
+#define special_output_port (*((volatile char *)0x20))
+
+// Stream "put" callback: write each character to the special output port.
+static int output_char(char c, FILE *stream)
+{
+    (void)stream;
+    special_output_port = c;
+    return 0;
+}
+
+static FILE mystdout = FDEV_SETUP_STREAM(output_char, NULL, _FDEV_SETUP_WRITE);
+
+void interface_init(void)
+{
+    // Set timer 1 to count cycles
+    TCCR1B = 1;
+
+    // Set output to simulator
+    stdout = &mystdout;
+    stderr = &mystdout;
+}
+
+void start_timing(void)
+{
+    TCNT1 = 0;
+}
+
+uint32_t end_timing(void)
+{
+    // Keep 16-bit wrap-around; 9 is presumably the measurement overhead.
+    return (uint16_t)(TCNT1 - 9);
+}
+
+void print_value(const char *label, int32_t value)
+{
+    printf("%-20s %ld\n", label, (long)value);
+}
diff --git a/benchmarks/interface.h b/benchmarks/interface.h
new file mode 100644
index 0000000..f8c5117
--- /dev/null
+++ b/benchmarks/interface.h
@@ -0,0 +1,22 @@
+// This file defines the hardware or simulator interface that will be used to
+// measure timings and report results.
+
+#ifndef BENCHMARK_INTERFACE_H
+#define BENCHMARK_INTERFACE_H
+
+#include <stdint.h>
+
+// Initialize the cycle counter (and, where needed, the console output)
+void interface_init(void);
+
+// Reset the cycle counter to zero
+void start_timing(void);
+
+// Return the number of clock cycles passed since start_timing().
+// 32 bits wide so that counts above 65535 are not truncated.
+uint32_t end_timing(void);
+
+// Print a value to console, along with a descriptive label
+void print_value(const char *label, int32_t value);
+
+#endif