aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPetteri.Aimonen <Petteri.Aimonen@gmail.com>2012-02-27 16:40:45 +0000
committerPetteri.Aimonen <Petteri.Aimonen@gmail.com>2012-02-27 16:40:45 +0000
commit90973e833d69a93525ef2c4eb5ab687ee13342df (patch)
treec154fc48cca12a87b8a82f9892c53d593f5da592
parente929442f7113dd321057293b8addd7b6e781d77f (diff)
Benchmark suite using simulators
-rw-r--r--benchmarks/Makefile33
-rw-r--r--benchmarks/benchmark.c214
-rw-r--r--benchmarks/generate_testcases.py140
-rw-r--r--benchmarks/interface-arm.c32
-rw-r--r--benchmarks/interface-avr.c39
-rw-r--r--benchmarks/interface.h16
6 files changed, 474 insertions, 0 deletions
diff --git a/benchmarks/Makefile b/benchmarks/Makefile
new file mode 100644
index 0000000..efd450e
--- /dev/null
+++ b/benchmarks/Makefile
@@ -0,0 +1,33 @@
+# Test cases and benchmarks for the library on the target processors
+# (currently ARM Cortex M3 and AVR). They are a bit tricky to run, as they
+# depend on specific simulator versions.
+
+# The run-* targets only launch a simulator and never produce a file of that
+# name, so declare them phony; a stray file with the same name would
+# otherwise stop them from running.
+.PHONY: run-benchmark-arm run-benchmark-avr
+
+# Sources compiled into both target binaries. testcases.c is not listed here
+# because benchmark.c pulls it in with #include; it is only a prerequisite.
+FILES = benchmark.c ../libfixmath/fix16.c ../libfixmath/fix16_sqrt.c ../libfixmath/fix16_exp.c
+
+# Flags common to both targets.
+CFLAGS = -DFIXMATH_NO_OVERFLOW -DFIXMATH_NO_ROUNDING -ffast-math -I../libfixmath
+
+# Generate the precomputed test vectors (the script writes testcases.c into
+# the current directory).
+testcases.c: generate_testcases.py
+	python $<
+
+# Note: this needs hacked QEmu that "makes no sense":
+# https://bugs.launchpad.net/qemu/+bug/696094
+benchmark-arm.elf: $(FILES) interface-arm.c testcases.c
+	arm-none-eabi-gcc -mcpu=cortex-m3 -mthumb -T generic-m-hosted.ld \
+	    -Wall -O2 $(CFLAGS) \
+	    -o $@ -I .. $(FILES) interface-arm.c -lm
+
+run-benchmark-arm: benchmark-arm.elf
+	qemu-system-arm -cpu cortex-m3 -icount 0 -device armv7m_nvic \
+	    -nographic -monitor null -serial null \
+	    -semihosting -kernel $<
+
+benchmark-avr.elf: $(FILES) interface-avr.c testcases.c
+	avr-gcc -mmcu=atmega128 $(CFLAGS) \
+	    -Wall -O2 -DFIXMATH_OPTIMIZE_8BIT \
+	    -o $@ -I .. $(FILES) interface-avr.c
+
+# Note: this needs simulavrxx 1.0rc0 or newer
+run-benchmark-avr: benchmark-avr.elf
+	simulavr -d atmega128 -f $< -W 0x20,- -T exit
+
+
diff --git a/benchmarks/benchmark.c b/benchmarks/benchmark.c
new file mode 100644
index 0000000..de83388
--- /dev/null
+++ b/benchmarks/benchmark.c
@@ -0,0 +1,214 @@
+#ifndef NO_FLOAT
+#include <math.h>
+#endif
+
+#include <fix16.h>
+#include "interface.h"
+#include <stdio.h>
+
+/* Autogenerated testcases */
+#include "testcases.c"
+
+/* Tools for profiling */
+
+/* Running statistics over a series of cycle-count measurements. */
+typedef struct {
+    uint32_t min;   /* smallest sample seen so far */
+    uint32_t max;   /* largest sample seen so far */
+    uint32_t sum;   /* total of all samples */
+    uint32_t count; /* number of samples */
+} cyclecount_t;
+
+/* Static initializer for cyclecount_t, in field order {min, max, sum, count}.
+ * min starts at 2^32-1 and max at 0 so that the first call to
+ * cyclecount_update establishes both. */
+#define CYCLECOUNT_INIT {0xFFFFFFFF, 0, 0, 0}
+
+/* Fold a single measurement of `cycles` into the statistics in `data`. */
+static void cyclecount_update(cyclecount_t *data, uint32_t cycles)
+{
+    const uint32_t lowest = data->min;
+    const uint32_t highest = data->max;
+
+    data->min = (cycles < lowest) ? cycles : lowest;
+    data->max = (cycles > highest) ? cycles : highest;
+
+    data->count += 1;
+    data->sum += cycles;
+}
+
+/* Time one execution of `statement` and fold the reported cycle count into
+ * `variable` (a cyclecount_t lvalue).
+ * Wrapped in do { } while (0) so that `MEASURE(...);` is a single statement
+ * and stays correct inside an unbraced if/else. */
+#define MEASURE(variable, statement) do { \
+    start_timing(); \
+    statement; \
+    cyclecount_update(&(variable), end_timing()); \
+} while (0)
+
+/* Print the minimum, maximum and average of the cyclecount_t `variable`,
+ * each prefixed with the string literal `label`.
+ * The average is reported as 0 when no samples were recorded (for example
+ * when every divisor in the test set was zero) instead of dividing by zero. */
+#define PRINT(variable, label) do { \
+    print_value(label " min", (variable).min); \
+    print_value(label " max", (variable).max); \
+    print_value(label " avg", \
+                (variable).count ? (variable).sum / (variable).count : 0); \
+} while (0)
+
+/* Per-operation cycle statistics for the fix16 functions under test.
+ * Each counter is filled through MEASURE() and reported through PRINT(). */
+static cyclecount_t exp_cycles = CYCLECOUNT_INIT;
+static cyclecount_t sqrt_cycles = CYCLECOUNT_INIT;
+static cyclecount_t add_cycles = CYCLECOUNT_INIT;
+static cyclecount_t sub_cycles = CYCLECOUNT_INIT;
+static cyclecount_t div_cycles = CYCLECOUNT_INIT;
+static cyclecount_t mul_cycles = CYCLECOUNT_INIT;
+
+/* Matching statistics for the native float operations; compiled out
+ * together with the float comparison loops when NO_FLOAT is defined. */
+#ifndef NO_FLOAT
+static cyclecount_t float_sqrtf_cycles = CYCLECOUNT_INIT;
+static cyclecount_t float_add_cycles = CYCLECOUNT_INIT;
+static cyclecount_t float_sub_cycles = CYCLECOUNT_INIT;
+static cyclecount_t float_div_cycles = CYCLECOUNT_INIT;
+static cyclecount_t float_mul_cycles = CYCLECOUNT_INIT;
+#endif
+
+/* Absolute difference between a computed result and its expected value, in
+ * raw fix16 units.
+ *
+ * When FIXMATH_NO_OVERFLOW is defined, an expected value of fix16_min is
+ * treated as "overflow expected" and always yields 0.
+ *
+ * A difference too large for fix16_t is clamped to INT32_MAX. A plain signed
+ * subtraction would overflow in that case, which is undefined behaviour and
+ * in practice wraps to a negative value that slips past every
+ * `delta(...) > limit` check. */
+static fix16_t delta(fix16_t result, fix16_t expected)
+{
+#ifdef FIXMATH_NO_OVERFLOW
+    // Ignore overflow errors when the detection is turned off
+    if (expected == fix16_min)
+        return 0;
+#endif
+
+    /* Subtract larger minus smaller in unsigned arithmetic: the true
+     * difference always fits in 32 unsigned bits and wrap-around is
+     * well defined there. */
+    uint32_t diff = (result >= expected)
+        ? (uint32_t)result - (uint32_t)expected
+        : (uint32_t)expected - (uint32_t)result;
+
+    if (diff > (uint32_t)INT32_MAX)
+        return INT32_MAX;
+
+    return (fix16_t)diff;
+}
+
+/* Largest delta() that still counts as a pass for sqrt, add, sub, mul and
+ * div: 1 raw unit when the library is built with FIXMATH_NO_ROUNDING,
+ * otherwise the result must match exactly. */
+#ifdef FIXMATH_NO_ROUNDING
+const fix16_t max_delta = 1;
+#else
+const fix16_t max_delta = 0;
+#endif
+
+/* Benchmark entry point.
+ *
+ * Runs every generated test case through the fix16 functions, timing each
+ * call and comparing the result with the precomputed expected value. Any
+ * mismatch is printed, followed by min/max/avg cycle counts per operation.
+ * Unless NO_FLOAT is defined, the same inputs are then timed with native
+ * float arithmetic for comparison (timing only, no result checks).
+ *
+ * Always returns 0, even when checks have failed. */
+int main()
+{
+ int i;
+ interface_init();
+
+ // Time an empty interval: the printed value is what the timer reports
+ // when nothing runs between start_timing() and end_timing().
+ start_timing();
+ print_value("Timestamp bias", end_timing());
+
+ // One-operand functions: sqrt and exp.
+ for (i = 0; i < TESTCASES1_COUNT; i++)
+ {
+ fix16_t input = testcases1[i].a;
+ fix16_t result;
+ fix16_t expected = testcases1[i].sqrt;
+ MEASURE(sqrt_cycles, result = fix16_sqrt(input));
+
+ // The sqrt result is only verified for strictly positive inputs.
+ if (input > 0 && delta(result, expected) > max_delta)
+ {
+ print_value("Failed SQRT, i", i);
+ print_value("Failed SQRT, input", input);
+ print_value("Failed SQRT, output", result);
+ print_value("Failed SQRT, expected", expected);
+ }
+
+ expected = testcases1[i].exp;
+ MEASURE(exp_cycles, result = fix16_exp(input));
+
+ // exp uses a fixed tolerance of 400 raw units instead of max_delta.
+ if (delta(result, expected) > 400)
+ {
+ print_value("Failed EXP, i", i);
+ print_value("Failed EXP, input", input);
+ print_value("Failed EXP, output", result);
+ print_value("Failed EXP, expected", expected);
+ }
+ }
+ PRINT(sqrt_cycles, "fix16_sqrt");
+ PRINT(exp_cycles, "fix16_exp");
+
+ // Two-operand functions: add, sub, mul and div.
+ for (i = 0; i < TESTCASES2_COUNT; i++)
+ {
+ fix16_t a = testcases2[i].a;
+ fix16_t b = testcases2[i].b;
+ // NOTE(review): presumably volatile so the timed store cannot be
+ // optimized away or moved - confirm.
+ volatile fix16_t result;
+
+ fix16_t expected = testcases2[i].add;
+ MEASURE(add_cycles, result = fix16_add(a, b));
+ if (delta(result, expected) > max_delta)
+ {
+ print_value("Failed ADD, i", i);
+ print_value("Failed ADD, a", a);
+ print_value("Failed ADD, b", b);
+ print_value("Failed ADD, output", result);
+ print_value("Failed ADD, expected", expected);
+ }
+
+ expected = testcases2[i].sub;
+ MEASURE(sub_cycles, result = fix16_sub(a, b));
+ if (delta(result, expected) > max_delta)
+ {
+ print_value("Failed SUB, i", i);
+ print_value("Failed SUB, a", a);
+ print_value("Failed SUB, b", b);
+ print_value("Failed SUB, output", result);
+ print_value("Failed SUB, expected", expected);
+ }
+
+ expected = testcases2[i].mul;
+ MEASURE(mul_cycles, result = fix16_mul(a, b));
+ if (delta(result, expected) > max_delta)
+ {
+ print_value("Failed MUL, i", i);
+ print_value("Failed MUL, a", a);
+ print_value("Failed MUL, b", b);
+ print_value("Failed MUL, output", result);
+ print_value("Failed MUL, expected", expected);
+ }
+
+ // Division is neither timed nor checked when the divisor is zero.
+ if (b != 0)
+ {
+ expected = testcases2[i].div;
+ MEASURE(div_cycles, result = fix16_div(a, b));
+ if (delta(result, expected) > max_delta)
+ {
+ print_value("Failed DIV, i", i);
+ print_value("Failed DIV, a", a);
+ print_value("Failed DIV, b", b);
+ print_value("Failed DIV, output", result);
+ print_value("Failed DIV, expected", expected);
+ }
+ }
+ }
+ PRINT(add_cycles, "fix16_add");
+ PRINT(sub_cycles, "fix16_sub");
+ PRINT(mul_cycles, "fix16_mul");
+ PRINT(div_cycles, "fix16_div");
+
+ /* Compare with floating point performance */
+#ifndef NO_FLOAT
+ // Same inputs converted to float; results are only stored, never checked.
+ for (i = 0; i < TESTCASES1_COUNT; i++)
+ {
+ float input = fix16_to_float(testcases1[i].a);
+ volatile float result;
+ MEASURE(float_sqrtf_cycles, result = sqrtf(input));
+ }
+ PRINT(float_sqrtf_cycles, "float sqrtf");
+
+ for (i = 0; i < TESTCASES2_COUNT; i++)
+ {
+ float a = fix16_to_float(testcases2[i].a);
+ float b = fix16_to_float(testcases2[i].b);
+ volatile float result;
+ MEASURE(float_add_cycles, result = a + b);
+ MEASURE(float_sub_cycles, result = a - b);
+ MEASURE(float_mul_cycles, result = a * b);
+
+ // Skip zero divisors, mirroring the fix16 division loop above.
+ if (b != 0)
+ {
+ MEASURE(float_div_cycles, result = a / b);
+ }
+ }
+ PRINT(float_add_cycles, "float add");
+ PRINT(float_sub_cycles, "float sub");
+ PRINT(float_mul_cycles, "float mul");
+ PRINT(float_div_cycles, "float div");
+#endif
+
+ return 0;
+}
diff --git a/benchmarks/generate_testcases.py b/benchmarks/generate_testcases.py
new file mode 100644
index 0000000..560ba4d
--- /dev/null
+++ b/benchmarks/generate_testcases.py
@@ -0,0 +1,140 @@
+'''This script precalculates the correct solutions for a set of test numbers
+and writes them to testcases.c. This is aimed at running the tests on-target;
+therefore it doesn't test all the cases or use floating point math, but
+instead generates a ~10k binary.
+
+The tests are chosen randomly, so there is quite a good chance of eventually
+catching most errors. Because the list is not regenerated automatically, the
+functioning of the benchmark application is still deterministic and easy
+to debug.
+'''
+
+import math
+import random
+import struct
+
+# Fix16 scaling factor (2**16, kept as a float so divisions are true divisions)
+scale = float(1 << 16)
+
+# Fix16 overflow indicator
+overflow = -(1 << 31)
+
+def f16_to_float(val):
+    '''Convert a raw fix16 integer to the real number it represents.'''
+    return val / scale
+
+def float_to_f16(val):
+    '''Convert a real number to a raw fix16 integer, rounding to nearest.
+
+    Returns the overflow indicator when the rounded value does not fit
+    in a signed 32-bit integer.
+    '''
+    fixed = int(round(val * scale))
+    if -2**31 <= fixed < 2**31:
+        return fixed
+    return overflow
+
+def to_ui32(val):
+    '''Reinterpret a signed 32-bit integer as its unsigned bit pattern.'''
+    packed = struct.pack('i', val)
+    (unsigned,) = struct.unpack('I', packed)
+    return unsigned
+
+# Pool of raw fix16 values (integers, scaled by 65536) from which the inputs
+# of the generated test cases are drawn with random.choice().
+testcases = [
+ # Small numbers
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+ -1, -2, -3, -4, -5, -6, -7, -8, -9, -10,
+
+ # Integer numbers
+ 0x10000, -0x10000, 0x20000, -0x20000, 0x30000, -0x30000,
+ 0x40000, -0x40000, 0x50000, -0x50000, 0x60000, -0x60000,
+
+ # Fractions (1/2, 1/4, 1/8)
+ 0x8000, -0x8000, 0x4000, -0x4000, 0x2000, -0x2000,
+
+ # Problematic carry
+ 0xFFFF, -0xFFFF, 0x1FFFF, -0x1FFFF, 0x3FFFF, -0x3FFFF,
+
+ # Smallest and largest values
+ 0x7FFFFFFF, -0x80000000
+]
+
+# Extend the pool with 10 random values of each magnitude class. No seed is
+# set, so every run of the script produces a different pool.
+for i in range(10):
+ # Large random numbers
+ testcases.append(random.randint(-0x80000000, 0x7FFFFFFF))
+
+ # Small random numbers
+ testcases.append(random.randint(-100000, 100000))
+
+ # Tiny random numbers
+ testcases.append(random.randint(-200, 200))
+
+# Emit testcases.c. The with-block guarantees that the file is flushed and
+# closed even if computing one of the test cases raises.
+with open("testcases.c", "w") as out:
+    # Fixed preamble: struct definitions and element-count macros.
+    out.write('''
+/* Automatically generated testcases for fix16 operations
+ * See generate_testcases.py for the generator.
+ */
+
+#include <fix16.h>
+
+typedef struct {
+ // Input
+ fix16_t a;
+
+ // Correct output
+ fix16_t sqrt;
+ fix16_t exp;
+} fix16_1op_testcase;
+
+typedef struct {
+ // Inputs
+ fix16_t a;
+ fix16_t b;
+
+ // Correct output
+ fix16_t add;
+ fix16_t sub;
+ fix16_t mul;
+ fix16_t div;
+} fix16_2op_testcase;
+
+#define TESTCASES1_COUNT (sizeof(testcases1)/sizeof(testcases1[0]))
+#define TESTCASES2_COUNT (sizeof(testcases2)/sizeof(testcases2[0]))
+
+''')
+
+    # Write testcases for 1-operand functions
+
+    out.write('static const fix16_1op_testcase testcases1[] = {\n')
+
+    for i in range(10):
+        a = random.choice(testcases)
+
+        # The square root is only defined for non-negative inputs; negative
+        # inputs get a placeholder expected value of 0.
+        if a >= 0:
+            sqrt = float_to_f16(math.sqrt(f16_to_float(a)))
+        else:
+            sqrt = 0
+
+        # math.exp raises OverflowError for very large arguments; the
+        # expected value is then the largest positive 32-bit value.
+        try:
+            exp = float_to_f16(math.exp(f16_to_float(a)))
+        except OverflowError:
+            exp = 0x7FFFFFFF
+
+        out.write(' {0x%08x, 0x%08x, 0x%08x}, // %d\n'
+                  % (to_ui32(a), to_ui32(sqrt), to_ui32(exp), i))
+
+    out.write('};\n\n')
+
+    # Write testcases for 2-operand functions
+
+    out.write('static const fix16_2op_testcase testcases2[] = {\n')
+
+    for i in range(50):
+        a = random.choice(testcases)
+        b = random.choice(testcases)
+
+        add = float_to_f16(f16_to_float(a) + f16_to_float(b))
+        sub = float_to_f16(f16_to_float(a) - f16_to_float(b))
+        mul = float_to_f16(f16_to_float(a) * f16_to_float(b))
+
+        # Division by zero gets a placeholder expected value of 0.
+        if b != 0:
+            div = float_to_f16(f16_to_float(a) / f16_to_float(b))
+        else:
+            div = 0
+
+        out.write(' {0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x}, // %d\n'
+                  % (to_ui32(a), to_ui32(b), to_ui32(add), to_ui32(sub), to_ui32(mul), to_ui32(div), i))
+
+    out.write('};\n\n')
+
diff --git a/benchmarks/interface-arm.c b/benchmarks/interface-arm.c
new file mode 100644
index 0000000..cd37979
--- /dev/null
+++ b/benchmarks/interface-arm.c
@@ -0,0 +1,32 @@
+#include "interface.h"
+#include <stdint.h>
+#include <stdio.h>
+
+// This targets an ARM Cortex M3 core using QEmu LM3S6965 emulation.
+// Memory-mapped 32-bit timer registers at fixed offsets from STBASE.
+// NOTE(review): presumably the Cortex-M SysTick control, reload and
+// current-value registers - confirm against the core documentation.
+#define STBASE 0xE000E000
+#define STCTRL (*(volatile uint32_t*)(0x010 + STBASE))
+#define STRELOAD (*(volatile uint32_t*)(0x014 + STBASE))
+#define STCURRENT (*(volatile uint32_t*)(0x018 + STBASE))
+
+
+// Set up the timer used for cycle counting: load the reload register with
+// the 24-bit maximum and write 5 to the control register.
+// NOTE(review): 5 presumably sets the enable and processor-clock-source
+// bits - confirm against the SysTick register description.
+void interface_init()
+{
+ STRELOAD = 0x00FFFFFF;
+ STCTRL = 5;
+}
+
+// Begin a measurement by writing 0 to the current-value register.
+// NOTE(review): presumably the write restarts the down-counter from the
+// reload value - confirm.
+void start_timing()
+{
+ STCURRENT = 0;
+}
+
+// Return the elapsed count since start_timing(): the reload value minus the
+// current counter value, minus a fixed 4.
+// NOTE(review): the 4 is presumably the measurement overhead in cycles -
+// confirm. The uint16_t return type keeps only the low 16 bits of the
+// 24-bit difference, so intervals of 65536 counts or more wrap around.
+uint16_t end_timing()
+{
+ return 0x00FFFFFF - STCURRENT - 4;
+}
+
+// Print `label` left-justified in a 20-column field, followed by `value`
+// in decimal and a newline, on stdout.
+void print_value(const char *label, int32_t value)
+{
+    // %ld requires a long argument; the cast keeps the call well defined
+    // on toolchains where int32_t is not a typedef of long.
+    printf("%-20s %ld\n", label, (long)value);
+}
+
diff --git a/benchmarks/interface-avr.c b/benchmarks/interface-avr.c
new file mode 100644
index 0000000..02731aa
--- /dev/null
+++ b/benchmarks/interface-avr.c
@@ -0,0 +1,39 @@
+#include <avr/io.h>
+#include <stdio.h>
+#include "interface.h"
+#include <stdint.h>
+
+// Byte-wide output port at address 0x20. The Makefile starts simulavr with
+// "-W 0x20,-", so bytes written here presumably appear on the simulator's
+// standard output (NOTE(review): confirm against the simulavr options).
+#define special_output_port (*((volatile char *)0x20))
+
+// stdio put-character callback: writes c to the output port and always
+// reports success (0). The stream argument is not used.
+static int output_char(char c, FILE *stream)
+{
+ special_output_port = c;
+ return 0;
+}
+
+// Write-only stream that routes characters through output_char().
+static FILE mystdout = FDEV_SETUP_STREAM(output_char, NULL, _FDEV_SETUP_WRITE);
+
+// Start timer 1 and redirect stdout and stderr to the simulator output port.
+void interface_init()
+{
+ // Set timer 1 to count cycles
+ // NOTE(review): 1 presumably selects the undivided system clock -
+ // confirm against the ATmega128 datasheet.
+ TCCR1B = 1;
+
+ // Set output to simulator
+ stdout = &mystdout;
+ stderr = &mystdout;
+}
+
+
+// Begin a measurement by resetting the timer 1 counter to zero.
+void start_timing()
+{
+ TCNT1 = 0;
+}
+
+// Return the timer 1 count since start_timing(), minus a fixed 9.
+// NOTE(review): the 9 is presumably the measurement overhead in cycles -
+// confirm. The result is 16 bits wide, so longer intervals wrap around.
+uint16_t end_timing()
+{
+ return TCNT1 - 9;
+}
+
+// Print `label` left-justified in a 20-column field, followed by `value`
+// in decimal and a newline, on stdout.
+void print_value(const char *label, int32_t value)
+{
+    // %ld requires a long argument; the cast keeps the call well defined
+    // on toolchains where int32_t is not a typedef of long.
+    printf("%-20s %ld\n", label, (long)value);
+}
diff --git a/benchmarks/interface.h b/benchmarks/interface.h
new file mode 100644
index 0000000..f8c5117
--- /dev/null
+++ b/benchmarks/interface.h
@@ -0,0 +1,16 @@
+// This file defines the hardware or simulator interface that will be used to
+// measure timings and report results. Each target provides its own
+// implementation (interface-arm.c, interface-avr.c).
+
+#ifndef BENCHMARK_INTERFACE_H
+#define BENCHMARK_INTERFACE_H
+
+#include <stdint.h>
+
+// Initialize the timer and the output channel. Call once before any of
+// the other functions.
+void interface_init(void);
+
+// Reset timer/counter/something
+void start_timing(void);
+
+// Return the number of clock cycles passed since start_timing().
+// The result is 16 bits wide, so intervals of 65536 cycles or more wrap.
+uint16_t end_timing(void);
+
+// Print a value to console, along with a descriptive label
+void print_value(const char *label, int32_t value);
+
+#endif /* BENCHMARK_INTERFACE_H */