Benchmark suite using simulators

This commit is contained in:
Petteri.Aimonen 2012-02-27 16:40:45 +00:00
parent e929442f71
commit 90973e833d
6 changed files with 474 additions and 0 deletions

33
benchmarks/Makefile Normal file
View File

@ -0,0 +1,33 @@
# These are test cases and benchmarks for the library on the target processors
# (currently ARM Cortex-M3 and AVR). They are a bit tricky to run, as they
# depend on specific simulator versions.

# Sources shared by both targets: the benchmark driver and the library
# functions it exercises.
FILES = benchmark.c ../libfixmath/fix16.c ../libfixmath/fix16_sqrt.c ../libfixmath/fix16_exp.c

# Preprocessor switches and include path handed to both cross compilers.
CFLAGS = -DFIXMATH_NO_OVERFLOW -DFIXMATH_NO_ROUNDING -ffast-math -I../libfixmath

# The run-* targets launch a simulator and never create a file of that name,
# so a stray file called e.g. "run-benchmark-arm" must not satisfy them.
.PHONY: run-benchmark-arm run-benchmark-avr

# Regenerate the test vectors whenever the generator script changes.
testcases.c: generate_testcases.py
	python $<

# Note: this needs hacked QEmu that "makes no sense":
# https://bugs.launchpad.net/qemu/+bug/696094
benchmark-arm.elf: $(FILES) interface-arm.c testcases.c
	arm-none-eabi-gcc -mcpu=cortex-m3 -mthumb -T generic-m-hosted.ld \
		-Wall -O2 $(CFLAGS) \
		-o $@ -I .. $(FILES) interface-arm.c -lm

run-benchmark-arm: benchmark-arm.elf
	qemu-system-arm -cpu cortex-m3 -icount 0 -device armv7m_nvic \
		-nographic -monitor null -serial null \
		-semihosting -kernel $<

benchmark-avr.elf: $(FILES) interface-avr.c testcases.c
	avr-gcc -Wall -mmcu=atmega128 $(CFLAGS) \
		-O2 -DFIXMATH_OPTIMIZE_8BIT \
		-o $@ -I .. $(FILES) interface-avr.c

# Note: this needs simulavrxx 1.0rc0 or newer
run-benchmark-avr: benchmark-avr.elf
	simulavr -d atmega128 -f $< -W 0x20,- -T exit

214
benchmarks/benchmark.c Normal file
View File

@ -0,0 +1,214 @@
#ifndef NO_FLOAT
#include <math.h>
#endif
#include <fix16.h>
#include "interface.h"
#include <stdio.h>
/* Autogenerated testcases */
#include "testcases.c"
/* Tools for profiling */
// Running statistics over a series of cycle-count measurements.
typedef struct {
// Smallest single measurement recorded so far.
uint32_t min;
// Largest single measurement recorded so far.
uint32_t max;
// Sum of all measurements; divided by count to report the average.
uint32_t sum;
// Number of measurements recorded.
uint32_t count;
} cyclecount_t;
// Initializer for cyclecount_t structure, in field order {min, max, sum, count}.
// Max is initialized to 0 and min is 2^32-1 so that first call to cyclecount_update will set them.
#define CYCLECOUNT_INIT {0xFFFFFFFF, 0, 0, 0}
// Fold one measurement into the running statistics: widen the min/max bounds
// when the sample falls outside them, add it to the sum and bump the count.
static void cyclecount_update(cyclecount_t *data, uint32_t cycles)
{
    data->count += 1;
    data->sum += cycles;
    data->min = (cycles < data->min) ? cycles : data->min;
    data->max = (cycles > data->max) ? cycles : data->max;
}
// Time a single statement and record the result.
//   variable:  cyclecount_t object that receives the measurement
//   statement: code executed between start_timing() and end_timing()
// Wrapped in do { } while (0) so that "MEASURE(...);" behaves as exactly one
// statement, including as the body of an if that is followed by an else.
#define MEASURE(variable, statement) do { \
        start_timing(); \
        statement; \
        cyclecount_update(&(variable), end_timing()); \
    } while (0)
// Report the minimum, maximum and average of a cyclecount_t through
// print_value(). "label" must be a string literal, because the " min" /
// " max" / " avg" suffixes are attached by literal concatenation.
// If no measurement was recorded (count == 0) the average is reported as 0
// instead of dividing by zero.
// Wrapped in do { } while (0) so that "PRINT(...);" is a single statement.
#define PRINT(variable, label) do { \
        print_value(label " min", (variable).min); \
        print_value(label " max", (variable).max); \
        print_value(label " avg", \
                    (variable).count ? (variable).sum / (variable).count : 0); \
    } while (0)
// One statistics accumulator per benchmarked fix16 operation. Each one is
// filled through MEASURE() and reported through PRINT() in main().
static cyclecount_t exp_cycles = CYCLECOUNT_INIT;
static cyclecount_t sqrt_cycles = CYCLECOUNT_INIT;
static cyclecount_t add_cycles = CYCLECOUNT_INIT;
static cyclecount_t sub_cycles = CYCLECOUNT_INIT;
static cyclecount_t div_cycles = CYCLECOUNT_INIT;
static cyclecount_t mul_cycles = CYCLECOUNT_INIT;
// Accumulators for the floating point comparison runs; compiled out together
// with the float benchmarks when NO_FLOAT is defined.
#ifndef NO_FLOAT
static cyclecount_t float_sqrtf_cycles = CYCLECOUNT_INIT;
static cyclecount_t float_add_cycles = CYCLECOUNT_INIT;
static cyclecount_t float_sub_cycles = CYCLECOUNT_INIT;
static cyclecount_t float_div_cycles = CYCLECOUNT_INIT;
static cyclecount_t float_mul_cycles = CYCLECOUNT_INIT;
#endif
// Absolute difference between a computed result and the expected value, in
// raw fix16 units.
//
// When FIXMATH_NO_OVERFLOW is defined, an expected value of fix16_min is
// treated as "result overflowed" and always yields 0.
//
// The difference is formed in unsigned arithmetic: subtracting two fix16_t
// values of opposite sign directly can overflow the signed type, which is
// undefined behaviour and could make a badly wrong result look like a small
// error. Differences too large for fix16_t are clamped to 0x7FFFFFFF.
// Assumes fix16_t is a 32-bit signed integer, as the 32-bit constants in the
// generated test cases suggest.
static fix16_t delta(fix16_t result, fix16_t expected)
{
#ifdef FIXMATH_NO_OVERFLOW
    // Ignore overflow errors when the detection is turned off
    if (expected == fix16_min)
        return 0;
#endif

    uint32_t hi = (uint32_t)((result >= expected) ? result : expected);
    uint32_t lo = (uint32_t)((result >= expected) ? expected : result);
    uint32_t diff = hi - lo;

    if (diff > 0x7FFFFFFFu)
        return 0x7FFFFFFF;

    return (fix16_t)diff;
}
// Largest error, in raw fix16 units, accepted for sqrt/add/sub/mul/div:
// one unit when the library is built with FIXMATH_NO_ROUNDING, otherwise the
// result must match the expected value exactly.
#ifdef FIXMATH_NO_ROUNDING
const fix16_t max_delta = 1;
#else
const fix16_t max_delta = 0;
#endif
// Benchmark entry point.
// Runs every generated test case through the fix16 functions, timing each
// call with MEASURE() and printing a "Failed ..." report whenever the result
// deviates from the expected value by more than the allowed tolerance. After
// each group the min/max/avg cycle counts are printed. Unless NO_FLOAT is
// defined, the same inputs are then timed with float arithmetic (results are
// not checked there). Always returns 0.
int main()
{
int i;
interface_init();
// Measure an empty interval so the residual timer offset is visible in the output.
start_timing();
print_value("Timestamp bias", end_timing());
// One-operand functions: sqrt and exp.
for (i = 0; i < TESTCASES1_COUNT; i++)
{
fix16_t input = testcases1[i].a;
fix16_t result;
fix16_t expected = testcases1[i].sqrt;
MEASURE(sqrt_cycles, result = fix16_sqrt(input));
// The sqrt result is only checked for strictly positive inputs.
if (input > 0 && delta(result, expected) > max_delta)
{
print_value("Failed SQRT, i", i);
print_value("Failed SQRT, input", input);
print_value("Failed SQRT, output", result);
print_value("Failed SQRT, expected", expected);
}
expected = testcases1[i].exp;
MEASURE(exp_cycles, result = fix16_exp(input));
// exp is allowed a much wider tolerance (400 raw units) than the other operations.
if (delta(result, expected) > 400)
{
print_value("Failed EXP, i", i);
print_value("Failed EXP, input", input);
print_value("Failed EXP, output", result);
print_value("Failed EXP, expected", expected);
}
}
PRINT(sqrt_cycles, "fix16_sqrt");
PRINT(exp_cycles, "fix16_exp");
// Two-operand functions: add, sub, mul and div.
for (i = 0; i < TESTCASES2_COUNT; i++)
{
fix16_t a = testcases2[i].a;
fix16_t b = testcases2[i].b;
volatile fix16_t result;
fix16_t expected = testcases2[i].add;
MEASURE(add_cycles, result = fix16_add(a, b));
if (delta(result, expected) > max_delta)
{
print_value("Failed ADD, i", i);
print_value("Failed ADD, a", a);
print_value("Failed ADD, b", b);
print_value("Failed ADD, output", result);
print_value("Failed ADD, expected", expected);
}
expected = testcases2[i].sub;
MEASURE(sub_cycles, result = fix16_sub(a, b));
if (delta(result, expected) > max_delta)
{
print_value("Failed SUB, i", i);
print_value("Failed SUB, a", a);
print_value("Failed SUB, b", b);
print_value("Failed SUB, output", result);
print_value("Failed SUB, expected", expected);
}
expected = testcases2[i].mul;
MEASURE(mul_cycles, result = fix16_mul(a, b));
if (delta(result, expected) > max_delta)
{
print_value("Failed MUL, i", i);
print_value("Failed MUL, a", a);
print_value("Failed MUL, b", b);
print_value("Failed MUL, output", result);
print_value("Failed MUL, expected", expected);
}
// Division is neither timed nor checked when the divisor is zero.
if (b != 0)
{
expected = testcases2[i].div;
MEASURE(div_cycles, result = fix16_div(a, b));
if (delta(result, expected) > max_delta)
{
print_value("Failed DIV, i", i);
print_value("Failed DIV, a", a);
print_value("Failed DIV, b", b);
print_value("Failed DIV, output", result);
print_value("Failed DIV, expected", expected);
}
}
}
PRINT(add_cycles, "fix16_add");
PRINT(sub_cycles, "fix16_sub");
PRINT(mul_cycles, "fix16_mul");
PRINT(div_cycles, "fix16_div");
/* Compare with floating point performance */
#ifndef NO_FLOAT
for (i = 0; i < TESTCASES1_COUNT; i++)
{
float input = fix16_to_float(testcases1[i].a);
// volatile: presumably so the compiler cannot discard the timed computation.
volatile float result;
MEASURE(float_sqrtf_cycles, result = sqrtf(input));
}
PRINT(float_sqrtf_cycles, "float sqrtf");
for (i = 0; i < TESTCASES2_COUNT; i++)
{
float a = fix16_to_float(testcases2[i].a);
float b = fix16_to_float(testcases2[i].b);
volatile float result;
MEASURE(float_add_cycles, result = a + b);
MEASURE(float_sub_cycles, result = a - b);
MEASURE(float_mul_cycles, result = a * b);
// Skip zero divisors, mirroring the fix16 division loop above.
if (b != 0)
{
MEASURE(float_div_cycles, result = a / b);
}
}
PRINT(float_add_cycles, "float add");
PRINT(float_sub_cycles, "float sub");
PRINT(float_mul_cycles, "float mul");
PRINT(float_div_cycles, "float div");
#endif
return 0;
}

View File

@ -0,0 +1,140 @@
'''This script precalculates the correct solutions for a set of test numbers,
and writes them to testcases.c. It is aimed at running the tests on-target,
so it doesn't test all the cases or use floating point math, but
instead generates a ~10k binary.
The tests are chosen randomly, so there is a fairly good chance of eventually
catching most errors. Because the list is not regenerated automatically, the
behaviour of the benchmark application is still deterministic and easy
to debug.
'''
import math
import random
import struct
# Fix16 scaling factor: raw integer values are divided by this to get the
# real number they represent (see f16_to_float).
scale = 65536.
# Fix16 overflow indicator: stored by float_to_f16 in place of any result
# that does not fit in a signed 32-bit value.
overflow = -2**31
def f16_to_float(val):
    """Return the real number represented by the raw fix16 integer `val`."""
    as_float = val / scale
    return as_float
def float_to_f16(val):
    """Convert a real number to the nearest raw fix16 integer.

    Returns the `overflow` marker when the scaled, rounded value does not
    fit in a signed 32-bit integer.
    """
    raw = int(round(val * scale))
    if -2**31 <= raw < 2**31:
        return raw
    return overflow
def to_ui32(val):
    """Reinterpret a signed 32-bit integer as unsigned, keeping the bit pattern.

    Used so that negative values can be printed with the '%08x' format.
    """
    packed = struct.pack('i', val)
    (unsigned,) = struct.unpack('I', packed)
    return unsigned
# Pool of raw fix16 input values from which the emitted test cases are drawn.
# It starts with hand-picked edge cases.
testcases = [
    # Small numbers
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
    -1, -2, -3, -4, -5, -6, -7, -8, -9, -10,

    # Integer numbers
    0x10000, -0x10000, 0x20000, -0x20000, 0x30000, -0x30000,
    0x40000, -0x40000, 0x50000, -0x50000, 0x60000, -0x60000,

    # Fractions (1/2, 1/4, 1/8)
    0x8000, -0x8000, 0x4000, -0x4000, 0x2000, -0x2000,

    # Problematic carry
    0xFFFF, -0xFFFF, 0x1FFFF, -0x1FFFF, 0x3FFFF, -0x3FFFF,

    # Smallest and largest values
    0x7FFFFFFF, -0x80000000,
]

# Top the pool up with ten rounds of random values, one from each magnitude
# class per round.
for _ in range(10):
    testcases.extend([
        random.randint(-0x80000000, 0x7FFFFFFF),  # large random number
        random.randint(-100000, 100000),          # small random number
        random.randint(-200, 200),                # tiny random number
    ])
# Generate testcases.c: the type definitions first, then one table of
# 1-operand cases and one table of 2-operand cases. The file is opened in a
# with-block so it is flushed and closed even if an error is raised while
# the tables are being written.
with open("testcases.c", "w") as out:
    out.write('''
/* Automatically generated testcases for fix16 operations
* See generate_testcases.py for the generator.
*/
#include <fix16.h>
typedef struct {
// Input
fix16_t a;
// Correct output
fix16_t sqrt;
fix16_t exp;
} fix16_1op_testcase;
typedef struct {
// Inputs
fix16_t a;
fix16_t b;
// Correct output
fix16_t add;
fix16_t sub;
fix16_t mul;
fix16_t div;
} fix16_2op_testcase;
#define TESTCASES1_COUNT (sizeof(testcases1)/sizeof(testcases1[0]))
#define TESTCASES2_COUNT (sizeof(testcases2)/sizeof(testcases2[0]))
''')

    # Write testcases for 1-operand functions
    out.write('static const fix16_1op_testcase testcases1[] = {\n')

    for i in range(10):
        a = random.choice(testcases)

        # sqrt is only computed for non-negative inputs; 0 is stored otherwise.
        if a >= 0:
            sqrt = float_to_f16(math.sqrt(f16_to_float(a)))
        else:
            sqrt = 0

        # math.exp raises OverflowError for very large arguments; in that
        # case the largest positive raw value is stored.
        # NOTE(review): results that merely exceed the fix16 range are turned
        # into the overflow marker by float_to_f16 instead - confirm that the
        # two different encodings are intended.
        try:
            exp = float_to_f16(math.exp(f16_to_float(a)))
        except OverflowError:
            exp = 0x7FFFFFFF

        out.write(' {0x%08x, 0x%08x, 0x%08x}, // %d\n'
                  % (to_ui32(a), to_ui32(sqrt), to_ui32(exp), i))

    out.write('};\n\n')

    # Write testcases for 2-operand functions
    out.write('static const fix16_2op_testcase testcases2[] = {\n')

    for i in range(50):
        a = random.choice(testcases)
        b = random.choice(testcases)

        add = float_to_f16(f16_to_float(a) + f16_to_float(b))
        sub = float_to_f16(f16_to_float(a) - f16_to_float(b))
        mul = float_to_f16(f16_to_float(a) * f16_to_float(b))

        # Division by zero has no expected value; 0 is stored as a placeholder.
        if b != 0:
            div = float_to_f16(f16_to_float(a) / f16_to_float(b))
        else:
            div = 0

        out.write(' {0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x}, // %d\n'
                  % (to_ui32(a), to_ui32(b), to_ui32(add), to_ui32(sub), to_ui32(mul), to_ui32(div), i))

    out.write('};\n\n')

View File

@ -0,0 +1,32 @@
#include "interface.h"
#include <stdint.h>
#include <stdio.h>
// This targets an ARM Cortex M3 core using QEmu LM3S6965 emulation.
// Memory-mapped timer registers, accessed as volatile 32-bit words at fixed
// offsets from STBASE.
// NOTE(review): the addresses look like the ARMv7-M SysTick control, reload
// and current-value registers - confirm against the core manual.
#define STBASE 0xE000E000
// Control register: written once in interface_init().
#define STCTRL (*(volatile uint32_t*)(0x010 + STBASE))
// Reload register: written once in interface_init().
#define STRELOAD (*(volatile uint32_t*)(0x014 + STBASE))
// Current counter value: written in start_timing(), read in end_timing().
#define STCURRENT (*(volatile uint32_t*)(0x018 + STBASE))
// Prepare the timer used for cycle counting: program the reload value
// 0x00FFFFFF, then write the control value 5 (bits 0 and 2 set).
// NOTE(review): on SysTick those bits would mean "enable" and "use the
// processor clock" - confirm against the core manual.
void interface_init()
{
    STRELOAD = 0x00FFFFFF;
    STCTRL = 0x4 | 0x1;
}
// Begin a measurement by writing 0 to the current-value register.
// end_timing() computes the elapsed count as 0x00FFFFFF minus that register.
void start_timing()
{
STCURRENT = 0;
}
// Return the count elapsed since start_timing(): the distance of the
// current-value register from 0x00FFFFFF, minus a fixed correction of 4
// (presumably the overhead of the measurement itself).
// NOTE(review): the result is narrowed to uint16_t, so intervals longer than
// 65535 counts wrap around even though the reload value is 24 bits wide.
uint16_t end_timing()
{
    uint32_t elapsed = 0x00FFFFFF - STCURRENT;
    return elapsed - 4;
}
// Print "label value" on one line via printf, with the label left-aligned
// and padded to 20 characters.
// The value is cast to long so the argument always matches the %ld
// conversion, including on platforms where int32_t is not a long.
void print_value(const char *label, int32_t value)
{
    printf("%-20s %ld\n", label, (long)value);
}

View File

@ -0,0 +1,39 @@
#include <avr/io.h>
#include <stdio.h>
#include "interface.h"
#include <stdint.h>
// Byte-wide location at address 0x20 to which output characters are written.
// The Makefile runs simulavr with "-W 0x20,-", i.e. the same address.
#define special_output_port (*((volatile char *)0x20))
// Character sink for the stdio stream set up below: stores the character in
// the special output port and reports success (0). The stream argument is
// not used.
static int output_char(char c, FILE *stream)
{
    (void)stream;
    special_output_port = c;
    return 0;
}
// Write-only stdio stream whose characters are emitted through output_char();
// installed as stdout and stderr by interface_init().
static FILE mystdout = FDEV_SETUP_STREAM(output_char, NULL, _FDEV_SETUP_WRITE);
// Prepare the AVR target: start timer 1 so it can be used as a cycle
// counter, and route stdout/stderr to the simulator output stream.
void interface_init()
{
    // Set timer 1 to count cycles
    TCCR1B = 0x01;

    // Set output to simulator
    stderr = stdout = &mystdout;
}
// Begin a measurement by resetting the timer 1 counter to zero.
void start_timing()
{
TCNT1 = 0;
}
// Return the timer 1 count since start_timing(), minus a fixed correction of
// 9 (presumably the overhead of the measurement itself).
uint16_t end_timing()
{
    uint16_t ticks = TCNT1;
    return ticks - 9;
}
// Print "label value" on one line via printf, with the label left-aligned
// and padded to 20 characters.
// The value is cast to long so the argument always matches the %ld
// conversion, independent of how int32_t is defined by the toolchain.
void print_value(const char *label, int32_t value)
{
    printf("%-20s %ld\n", label, (long)value);
}

16
benchmarks/interface.h Normal file
View File

@ -0,0 +1,16 @@
// This file defines the hardware or simulator interface that will be used to
// measure timings and report results. Each target provides its own
// implementation of these functions.
#ifndef INTERFACE_H
#define INTERFACE_H

#include <stdint.h>

// Initialize the timer and the output channel. Call once before anything else.
void interface_init(void);

// Reset timer/counter/something, marking the start of a measured interval.
void start_timing(void);

// Return the number of clock cycles passed since start_timing().
// The result is 16 bits wide, so longer intervals wrap around.
uint16_t end_timing(void);

// Print a value to console, along with a descriptive label.
void print_value(const char *label, int32_t value);

#endif /* INTERFACE_H */