Benchmark suite using simulators

This commit is contained in:
Petteri.Aimonen 2012-02-27 16:40:45 +00:00
parent e929442f71
commit 90973e833d
6 changed files with 474 additions and 0 deletions

benchmarks/Makefile Normal file
View File

@ -0,0 +1,33 @@
# These are testcases & benchmarks for the library on the target processors
# (currently ARM Cortex M3 and AVR). They are a bit tricky to run, as they
# depend on specific simulator versions.
# Sources shared by every target build: the benchmark driver and the
# libfixmath modules it is compiled together with.
FILES = benchmark.c \
        ../libfixmath/fix16.c \
        ../libfixmath/fix16_sqrt.c \
        ../libfixmath/fix16_exp.c
python $<
# Build the benchmark for ARM Cortex-M3.
# testcases.c is listed as a prerequisite only: benchmark.c #includes it, so it
# is not passed to the compiler separately.
# Note: this needs hacked QEmu that "makes no sense":
# (kept outside the recipe so make does not echo it on every build)
benchmark-arm.elf: $(FILES) interface-arm.c testcases.c
	arm-none-eabi-gcc -mcpu=cortex-m3 -mthumb -T generic-m-hosted.ld \
		-Wall -O2 $(CFLAGS) \
		-o $@ -I .. $(FILES) interface-arm.c -lm
# Run the ARM benchmark in QEmu. Monitor and serial are disabled and
# semihosting is enabled, so console output is expected to arrive through
# semihosting.
# Declared phony: this target runs a program and never creates a file.
.PHONY: run-benchmark-arm
run-benchmark-arm: benchmark-arm.elf
	qemu-system-arm -cpu cortex-m3 -icount 0 -device armv7m_nvic \
		-nographic -monitor null -serial null \
		-semihosting -kernel $<
# Build the benchmark for AVR (ATmega128).
# testcases.c is listed as a prerequisite only: benchmark.c #includes it.
# -O2 matches the ARM build so both targets are measured at the same
# optimisation level; it precedes $(CFLAGS) so the caller can override it.
# NOTE(review): the fixed bias subtracted in interface-avr.c may have been
# calibrated for a different optimisation level -- check "Timestamp bias".
# -lm links the avr-libc math library needed for the <math.h> functions the
# floating point comparison calls.
benchmark-avr.elf: $(FILES) interface-avr.c testcases.c
	avr-gcc -Wall -O2 -mmcu=atmega128 $(CFLAGS) \
		-o $@ -I .. $(FILES) interface-avr.c -lm
# Run the AVR benchmark in simulavr. Address 0x20 is the location
# interface-avr.c writes its output characters to; "-W 0x20,-" is expected to
# forward those writes to stdout, and "-T exit" to stop the simulation when
# the program reaches exit.
# Note: this needs simulavrxx 1.0rc0 or newer
# Declared phony: this target runs a program and never creates a file.
.PHONY: run-benchmark-avr
run-benchmark-avr: benchmark-avr.elf
	simulavr -d atmega128 -f $< -W 0x20,- -T exit

benchmarks/benchmark.c Normal file
View File

@ -0,0 +1,214 @@
/* <math.h> is only needed by the floating point comparison, which is
 * compiled out when NO_FLOAT is defined. */
#ifndef NO_FLOAT
#include <math.h>
#endif

#include <fix16.h>
#include "interface.h"
#include <stdio.h>

/* Autogenerated testcases */
#include "testcases.c"
/* Tools for profiling */
/* Running statistics over a series of cycle-count measurements. */
typedef struct {
    uint32_t min;   /* smallest measurement recorded so far */
    uint32_t max;   /* largest measurement recorded so far */
    uint32_t sum;   /* total of all measurements (wraps modulo 2^32) */
    uint32_t count; /* number of measurements recorded */
} cyclecount_t;

// Initializer for cyclecount_t structure.
// Max is initialized to 0 and min is 2^32-1 so that first call to cyclecount_update will set them.
#define CYCLECOUNT_INIT {0xFFFFFFFF, 0, 0, 0}

// Update cyclecount_t structure after a single measurement has been made.
//   data:   statistics to update in place
//   cycles: the new measurement
static void cyclecount_update(cyclecount_t *data, uint32_t cycles)
{
    if (cycles < data->min)
        data->min = cycles;

    if (cycles > data->max)
        data->max = cycles;

    data->sum += cycles;

    // The average is computed as sum / count, so every sample must be counted.
    data->count++;
}
// Time a single statement and record the result in a cyclecount_t.
//   variable:  cyclecount_t lvalue that receives the measurement
//   statement: the code to time (evaluated exactly once)
// Wrapped in do { } while (0) so that "MEASURE(...);" is a single statement
// and stays safe inside an unbraced if/else.
#define MEASURE(variable, statement) do { \
    start_timing(); \
    statement; \
    cyclecount_update(&(variable), end_timing()); \
} while (0)

// Print the min, max and average of a cyclecount_t.
//   variable: cyclecount_t lvalue to report
//   label:    must be a string literal; the suffixes are appended by
//             literal concatenation
// If nothing was recorded the average is reported as 0 instead of dividing
// by zero (min and max then still hold their initial values).
#define PRINT(variable, label) do { \
    print_value(label " min", (variable).min); \
    print_value(label " max", (variable).max); \
    print_value(label " avg", \
                (variable).count ? (variable).sum / (variable).count : 0); \
} while (0)
/* One statistics accumulator per benchmarked fix16 operation. */
static cyclecount_t exp_cycles = CYCLECOUNT_INIT;
static cyclecount_t sqrt_cycles = CYCLECOUNT_INIT;
static cyclecount_t add_cycles = CYCLECOUNT_INIT;
static cyclecount_t sub_cycles = CYCLECOUNT_INIT;
static cyclecount_t div_cycles = CYCLECOUNT_INIT;
static cyclecount_t mul_cycles = CYCLECOUNT_INIT;

/* Accumulators for the floating point reference measurements; only present
 * when the float comparison is compiled in. */
#ifndef NO_FLOAT
static cyclecount_t float_sqrtf_cycles = CYCLECOUNT_INIT;
static cyclecount_t float_add_cycles = CYCLECOUNT_INIT;
static cyclecount_t float_sub_cycles = CYCLECOUNT_INIT;
static cyclecount_t float_div_cycles = CYCLECOUNT_INIT;
static cyclecount_t float_mul_cycles = CYCLECOUNT_INIT;
#endif
/* Absolute difference between a computed result and the expected value,
 * in raw fix16 units.
 *
 * Returns 0 when expected is fix16_min, and otherwise |result - expected|
 * saturated to 0x7FFFFFFF.
 *
 * The subtraction is done in unsigned arithmetic: subtracting two signed
 * 32-bit values directly can overflow, and a wrapped (negative) difference
 * would make the largest errors compare as "within tolerance".
 * NOTE(review): assumes fix16_t is a 32-bit signed integer -- confirm
 * against fix16.h.
 */
static fix16_t delta(fix16_t result, fix16_t expected)
{
    uint32_t diff;

    // Ignore overflow errors when the detection is turned off
    if (expected == fix16_min)
        return 0;

    if (result >= expected)
        diff = (uint32_t)result - (uint32_t)expected;
    else
        diff = (uint32_t)expected - (uint32_t)result;

    /* Saturate instead of wrapping so huge errors stay huge. */
    if (diff > 0x7FFFFFFFu)
        return 0x7FFFFFFF;

    return (fix16_t)diff;
}
/* Largest accepted difference (in raw fix16 units) between a result and its
 * expected value for sqrt, add, sub, mul and div.
 *
 * The two definitions are alternatives and must be selected by the
 * preprocessor; defining both is a redefinition error.
 * NOTE(review): the guard macro is inferred. FIXMATH_NO_ROUNDING is the
 * libfixmath switch that disables rounding, which would explain allowing an
 * error of one unit -- confirm against the library's build options.
 */
#ifdef FIXMATH_NO_ROUNDING
const fix16_t max_delta = 1;
#else
const fix16_t max_delta = 0;
#endif
int main()
int i;
print_value("Timestamp bias", end_timing());
for (i = 0; i < TESTCASES1_COUNT; i++)
fix16_t input = testcases1[i].a;
fix16_t result;
fix16_t expected = testcases1[i].sqrt;
MEASURE(sqrt_cycles, result = fix16_sqrt(input));
if (input > 0 && delta(result, expected) > max_delta)
print_value("Failed SQRT, i", i);
print_value("Failed SQRT, input", input);
print_value("Failed SQRT, output", result);
print_value("Failed SQRT, expected", expected);
expected = testcases1[i].exp;
MEASURE(exp_cycles, result = fix16_exp(input));
if (delta(result, expected) > 400)
print_value("Failed EXP, i", i);
print_value("Failed EXP, input", input);
print_value("Failed EXP, output", result);
print_value("Failed EXP, expected", expected);
PRINT(sqrt_cycles, "fix16_sqrt");
PRINT(exp_cycles, "fix16_exp");
for (i = 0; i < TESTCASES2_COUNT; i++)
fix16_t a = testcases2[i].a;
fix16_t b = testcases2[i].b;
volatile fix16_t result;
fix16_t expected = testcases2[i].add;
MEASURE(add_cycles, result = fix16_add(a, b));
if (delta(result, expected) > max_delta)
print_value("Failed ADD, i", i);
print_value("Failed ADD, a", a);
print_value("Failed ADD, b", b);
print_value("Failed ADD, output", result);
print_value("Failed ADD, expected", expected);
expected = testcases2[i].sub;
MEASURE(sub_cycles, result = fix16_sub(a, b));
if (delta(result, expected) > max_delta)
print_value("Failed SUB, i", i);
print_value("Failed SUB, a", a);
print_value("Failed SUB, b", b);
print_value("Failed SUB, output", result);
print_value("Failed SUB, expected", expected);
expected = testcases2[i].mul;
MEASURE(mul_cycles, result = fix16_mul(a, b));
if (delta(result, expected) > max_delta)
print_value("Failed MUL, i", i);
print_value("Failed MUL, a", a);
print_value("Failed MUL, b", b);
print_value("Failed MUL, output", result);
print_value("Failed MUL, expected", expected);
if (b != 0)
expected = testcases2[i].div;
MEASURE(div_cycles, result = fix16_div(a, b));
if (delta(result, expected) > max_delta)
print_value("Failed DIV, i", i);
print_value("Failed DIV, a", a);
print_value("Failed DIV, b", b);
print_value("Failed DIV, output", result);
print_value("Failed DIV, expected", expected);
PRINT(add_cycles, "fix16_add");
PRINT(sub_cycles, "fix16_sub");
PRINT(mul_cycles, "fix16_mul");
PRINT(div_cycles, "fix16_div");
/* Compare with floating point performance */
#ifndef NO_FLOAT
for (i = 0; i < TESTCASES1_COUNT; i++)
float input = fix16_to_float(testcases1[i].a);
volatile float result;
MEASURE(float_sqrtf_cycles, result = sqrtf(input));
PRINT(float_sqrtf_cycles, "float sqrtf");
for (i = 0; i < TESTCASES2_COUNT; i++)
float a = fix16_to_float(testcases2[i].a);
float b = fix16_to_float(testcases2[i].b);
volatile float result;
MEASURE(float_add_cycles, result = a + b);
MEASURE(float_sub_cycles, result = a - b);
MEASURE(float_mul_cycles, result = a * b);
if (b != 0)
MEASURE(float_div_cycles, result = a / b);
PRINT(float_add_cycles, "float add");
PRINT(float_sub_cycles, "float sub");
PRINT(float_mul_cycles, "float mul");
PRINT(float_div_cycles, "float div");
return 0;

View File

@ -0,0 +1,140 @@
'''This script precalculates the correct solutions for a set of test numbers,
and writes them to testcases.c. It is aimed at running the tests on-target,
therefore it doesn't test all the cases or use floating point math, but
instead generates a ~10k binary.
The tests are chosen randomly, so there is quite a good chance of eventually
catching most errors. Because the list is not regenerated automatically, the
functioning of the benchmark application is still deterministic and easy
to debug.
'''
import math
import random
import struct
# Fix16 scaling factor
scale = 65536.

# Fix16 overflow indicator
overflow = -2**31

def f16_to_float(val):
    '''Convert a raw fix16 integer to the float value it represents.'''
    return val / scale

def float_to_f16(val):
    '''Convert a float to the nearest raw fix16 integer.

    Results that do not fit in a signed 32-bit integer are replaced by the
    overflow indicator.
    '''
    val = int(round(val * scale))
    if val >= 2**31 or val < -2**31:
        val = overflow
    return val
def to_ui32(val):
    '''Reinterpret a signed int as the unsigned int with the same bytes.

    The value is packed as a native signed int and unpacked as a native
    unsigned int, so negative inputs come out as large positive numbers
    suitable for printing with an unsigned hex format.
    '''
    packed = struct.pack('i', val)
    return struct.unpack('I', packed)[0]
# Pool of raw fix16 input values that the generated tables draw from.
testcases = [
    # Small numbers
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
    -1, -2, -3, -4, -5, -6, -7, -8, -9, -10,

    # Integer numbers
    0x10000, -0x10000, 0x20000, -0x20000, 0x30000, -0x30000,
    0x40000, -0x40000, 0x50000, -0x50000, 0x60000, -0x60000,

    # Fractions (1/2, 1/4, 1/8)
    0x8000, -0x8000, 0x4000, -0x4000, 0x2000, -0x2000,

    # Problematic carry
    0xFFFF, -0xFFFF, 0x1FFFF, -0x1FFFF, 0x3FFFF, -0x3FFFF,

    # Smallest and largest values
    0x7FFFFFFF, -0x80000000
]

# Add 10 rounds of random values, one from each magnitude class per round.
for i in range(10):
    # Large random numbers
    testcases.append(random.randint(-0x80000000, 0x7FFFFFFF))

    # Small random numbers
    testcases.append(random.randint(-100000, 100000))

    # Tiny random numbers
    testcases.append(random.randint(-200, 200))
import os

# Output file for the generated C source. It stays open after this point
# because the table-writing code below keeps appending to it.
out = open("testcases.c", "w")

# File header: type definitions for the two tables plus element-count macros.
# The generator's own file name is filled in so the generated comment always
# points at the script that produced it.
out.write('''/* Automatically generated testcases for fix16 operations
 * See %s for the generator.
 */

#include <fix16.h>

typedef struct {
    // Input
    fix16_t a;

    // Correct output
    fix16_t sqrt;
    fix16_t exp;
} fix16_1op_testcase;

typedef struct {
    // Inputs
    fix16_t a;
    fix16_t b;

    // Correct output
    fix16_t add;
    fix16_t sub;
    fix16_t mul;
    fix16_t div;
} fix16_2op_testcase;

#define TESTCASES1_COUNT (sizeof(testcases1)/sizeof(testcases1[0]))
#define TESTCASES2_COUNT (sizeof(testcases2)/sizeof(testcases2[0]))

''' % os.path.basename(__file__))
# Write testcases for 1-operand functions
out.write('static const fix16_1op_testcase testcases1[] = {\n')

for i in range(10):
    a = random.choice(testcases)

    # The square root is only computed for non-negative inputs; negative
    # inputs get a placeholder of 0.
    if a >= 0:
        sqrt = float_to_f16(math.sqrt(f16_to_float(a)))
    else:
        sqrt = 0

    # math.exp raises OverflowError for large arguments; record the largest
    # positive fix16 value in that case.
    try:
        exp = float_to_f16(math.exp(f16_to_float(a)))
    except OverflowError:
        exp = 0x7FFFFFFF

    out.write(' {0x%08x, 0x%08x, 0x%08x}, // %d\n'
              % (to_ui32(a), to_ui32(sqrt), to_ui32(exp), i))

# Close the C array initializer.
out.write('};\n\n')
# Write testcases for 2-operand functions
out.write('static const fix16_2op_testcase testcases2[] = {\n')

for i in range(50):
    a = random.choice(testcases)
    b = random.choice(testcases)

    add = float_to_f16(f16_to_float(a) + f16_to_float(b))
    sub = float_to_f16(f16_to_float(a) - f16_to_float(b))
    mul = float_to_f16(f16_to_float(a) * f16_to_float(b))

    # Division by zero has no expected value; a placeholder of 0 is written.
    if b != 0:
        div = float_to_f16(f16_to_float(a) / f16_to_float(b))
    else:
        div = 0

    out.write(' {0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x}, // %d\n'
              % (to_ui32(a), to_ui32(b), to_ui32(add), to_ui32(sub), to_ui32(mul), to_ui32(div), i))

# Close the C array initializer and flush the generated file to disk.
out.write('};\n')
out.close()

View File

@ -0,0 +1,32 @@
#include "interface.h"
#include <stdint.h>
#include <stdio.h>
// This targets an ARM Cortex M3 core using QEmu LM3S6965 emulation.
/* Memory-mapped timer registers used for cycle counting, addressed as
 * offsets from STBASE.
 * NOTE(review): the addresses match the ARMv7-M SysTick control, reload and
 * current-value registers -- confirm against the core reference manual. */
#define STBASE      0xE000E000
#define STCTRL      (*(volatile uint32_t *)(STBASE + 0x010))
#define STRELOAD    (*(volatile uint32_t *)(STBASE + 0x014))
#define STCURRENT   (*(volatile uint32_t *)(STBASE + 0x018))
void interface_init()
void start_timing()
uint16_t end_timing()
return 0x00FFFFFF - STCURRENT - 4;
/* Print a labelled value on its own line: the label left-justified in a
 * 20-character column, followed by the value in decimal.
 *   label: text describing the value
 *   value: number to print
 */
void print_value(const char *label, int32_t value)
{
    /* %ld requires a long argument; int32_t is not long on every toolchain,
     * so convert explicitly to keep the format and the argument in sync. */
    printf("%-20s %ld\n", label, (long)value);
}

View File

@ -0,0 +1,39 @@
#include <avr/io.h>
#include <stdio.h>
#include "interface.h"
#include <stdint.h>
/* Data address 0x20: every character printed by the benchmark is written
 * here as a single byte. */
#define special_output_port (*((volatile char *)0x20))

/* stdio character-output callback: emit one character by writing it to the
 * special output port.
 *   c:      character to output
 *   stream: unused; every stream using this callback shares the same port
 * Always returns 0.
 */
static int output_char(char c, FILE *stream)
{
    (void)stream;
    special_output_port = c;
    return 0;
}

/* Write-only stream that sends its output through output_char(). */
static FILE mystdout = FDEV_SETUP_STREAM(output_char, NULL, _FDEV_SETUP_WRITE);
void interface_init()
// Set timer 1 to count cycles
TCCR1B = 1;
// Set output to simulator
stdout = &mystdout;
stderr = &mystdout;
void start_timing()
TCNT1 = 0;
uint16_t end_timing()
return TCNT1 - 9;
/* Print a labelled value on its own line: the label left-justified in a
 * 20-character column, followed by the value in decimal.
 *   label: text describing the value
 *   value: number to print
 */
void print_value(const char *label, int32_t value)
{
    /* %ld requires a long argument; convert explicitly so the format and
     * the argument stay in sync regardless of how int32_t is defined. */
    printf("%-20s %ld\n", label, (long)value);
}

benchmarks/interface.h Normal file
View File

@ -0,0 +1,16 @@
// This file defines the hardware or simulator interface that will be used to
// measure timings and report results.
#ifndef BENCHMARK_INTERFACE_H
#define BENCHMARK_INTERFACE_H

#include <stdint.h>

// Initialize the interface. Call once, before any other function here.
void interface_init(void);

// Reset timer/counter/something
void start_timing(void);

// Return the number of clock cycles passed since start_timing().
// The return type is 16 bits wide, so intervals above 65535 cycles wrap.
uint16_t end_timing(void);

// Print a value to console, along with a descriptive label
void print_value(const char *label, int32_t value);

#endif // BENCHMARK_INTERFACE_H