diff --git a/benchmarks/Makefile b/benchmarks/Makefile
new file mode 100644
index 0000000..efd450e
--- /dev/null
+++ b/benchmarks/Makefile
@@ -0,0 +1,33 @@
+# These are testcases & benchmarks for the library on the target processors
+# (currently ARM Cortex M3 and AVR). They are a bit tricky to run, as they
+# depend on specific simulator versions.
+
+FILES = benchmark.c ../libfixmath/fix16.c ../libfixmath/fix16_sqrt.c ../libfixmath/fix16_exp.c
+
+CFLAGS = -DFIXMATH_NO_OVERFLOW -DFIXMATH_NO_ROUNDING -ffast-math -I../libfixmath
+
+testcases.c: generate_testcases.py
+	python $<
+
+benchmark-arm.elf: $(FILES) interface-arm.c testcases.c
+	# Note: this needs hacked QEmu that "makes no sense":
+	# https://bugs.launchpad.net/qemu/+bug/696094
+	arm-none-eabi-gcc -mcpu=cortex-m3 -mthumb -T generic-m-hosted.ld \
+		-Wall -O2 $(CFLAGS) \
+		-o $@ -I .. $(FILES) interface-arm.c -lm
+
+run-benchmark-arm: benchmark-arm.elf
+	qemu-system-arm -cpu cortex-m3 -icount 0 -device armv7m_nvic \
+		-nographic -monitor null -serial null \
+		-semihosting -kernel $<
+
+benchmark-avr.elf: $(FILES) interface-avr.c testcases.c
+	avr-gcc -mmcu=atmega128 \
+		-Wall -O2 $(CFLAGS) -DFIXMATH_OPTIMIZE_8BIT \
+		-o $@ -I .. $(FILES) interface-avr.c
+
+run-benchmark-avr: benchmark-avr.elf
+	# Note: this needs simulavrxx 1.0rc0 or newer
+	simulavr -d atmega128 -f $< -W 0x20,- -T exit
+
+
diff --git a/benchmarks/benchmark.c b/benchmarks/benchmark.c
new file mode 100644
index 0000000..de83388
--- /dev/null
+++ b/benchmarks/benchmark.c
@@ -0,0 +1,214 @@
+#ifndef NO_FLOAT
+#include <math.h>
+#endif
+
+#include <fix16.h>
+#include "interface.h"
+#include <stdio.h>
+
+/* Autogenerated testcases */
+#include "testcases.c"
+
+/* Tools for profiling */
+
+typedef struct {
+    uint32_t min;   // smallest measurement recorded so far
+    uint32_t max;   // largest measurement recorded so far
+    uint32_t sum;   // total of all measurements (divided by count for the average)
+    uint32_t count; // number of measurements recorded
+} cyclecount_t;
+
+// Initializer for cyclecount_t structure.
+// Max is initialized to 0 and min is 2^32-1 so that first call to cyclecount_update will set them.
+#define CYCLECOUNT_INIT {0xFFFFFFFF, 0, 0, 0}
+
+// Fold one measured cycle count into the running min/max/sum/count statistics.
+static void cyclecount_update(cyclecount_t *data, uint32_t cycles)
+{
+    data->count += 1;
+    data->sum += cycles;
+
+    if (data->min > cycles)
+        data->min = cycles;
+    if (data->max < cycles)
+        data->max = cycles;
+}
+
+#define MEASURE(variable, statement) { \
+    start_timing(); \
+    statement; \
+    cyclecount_update(&variable, end_timing()); \
+}
+
+#define PRINT(variable, label) { \
+    print_value(label " min", variable.min); \
+    print_value(label " max", variable.max); \
+    print_value(label " avg", variable.sum / variable.count); \
+}
+
+static cyclecount_t exp_cycles = CYCLECOUNT_INIT;
+static cyclecount_t sqrt_cycles = CYCLECOUNT_INIT;
+static cyclecount_t add_cycles = CYCLECOUNT_INIT;
+static cyclecount_t sub_cycles = CYCLECOUNT_INIT;
+static cyclecount_t div_cycles = CYCLECOUNT_INIT;
+static cyclecount_t mul_cycles = CYCLECOUNT_INIT;
+
+#ifndef NO_FLOAT
+static cyclecount_t float_sqrtf_cycles = CYCLECOUNT_INIT;
+static cyclecount_t float_add_cycles = CYCLECOUNT_INIT;
+static cyclecount_t float_sub_cycles = CYCLECOUNT_INIT;
+static cyclecount_t float_div_cycles = CYCLECOUNT_INIT;
+static cyclecount_t float_mul_cycles = CYCLECOUNT_INIT;
+#endif
+
+static fix16_t delta(fix16_t result, fix16_t expected)
+{
+#ifdef FIXMATH_NO_OVERFLOW
+    // Ignore overflow errors when the detection is turned off
+    if (expected == fix16_min)
+        return 0;
+#endif
+
+    // Returns |result - expected|. The subtraction is done in 64 bits because
+    // the 32-bit difference overflows (undefined behaviour) for operands of
+    // opposite sign that are far apart; the result saturates at INT32_MAX.
+    int64_t diff = (int64_t)result - (int64_t)expected;
+    if (diff < 0)
+        diff = -diff;
+
+    return (diff > INT32_MAX) ? INT32_MAX : (fix16_t)diff;
+}
+
+#ifdef FIXMATH_NO_ROUNDING
+const fix16_t max_delta = 1;
+#else
+const fix16_t max_delta = 0;
+#endif
+
+int main()
+{
+    int i;
+    interface_init();
+    // Time an empty start/end pair and report it as the measurement bias.
+    start_timing();
+    print_value("Timestamp bias", end_timing());
+    // Single-operand functions: time fix16_sqrt and fix16_exp on each testcases1 entry.
+    for (i = 0; i < TESTCASES1_COUNT; i++)
+    {
+        fix16_t input = testcases1[i].a;
+        fix16_t result;
+        fix16_t expected = testcases1[i].sqrt;
+        MEASURE(sqrt_cycles, result = fix16_sqrt(input));
+        // The sqrt result is only verified for positive inputs.
+        if (input > 0 && delta(result, expected) > max_delta)
+        {
+            print_value("Failed SQRT, i", i);
+            print_value("Failed SQRT, input", input);
+            print_value("Failed SQRT, output", result);
+            print_value("Failed SQRT, expected", expected);
+        }
+        
+        expected = testcases1[i].exp;
+        MEASURE(exp_cycles, result = fix16_exp(input));
+        
+        if (delta(result, expected) > 400) // exp uses a fixed tolerance of 400 instead of max_delta
+        {
+            print_value("Failed EXP, i", i);
+            print_value("Failed EXP, input", input);
+            print_value("Failed EXP, output", result);
+            print_value("Failed EXP, expected", expected);
+        }
+    }
+    PRINT(sqrt_cycles, "fix16_sqrt");
+    PRINT(exp_cycles, "fix16_exp");
+    // Two-operand functions: time add, sub, mul and div on each testcases2 entry.
+    for (i = 0; i < TESTCASES2_COUNT; i++)
+    {
+        fix16_t a = testcases2[i].a;
+        fix16_t b = testcases2[i].b;
+        volatile fix16_t result;
+        
+        fix16_t expected = testcases2[i].add;
+        MEASURE(add_cycles, result = fix16_add(a, b));
+        if (delta(result, expected) > max_delta)
+        {
+            print_value("Failed ADD, i", i);
+            print_value("Failed ADD, a", a);
+            print_value("Failed ADD, b", b);
+            print_value("Failed ADD, output", result);
+            print_value("Failed ADD, expected", expected);
+        }
+        
+        expected = testcases2[i].sub;
+        MEASURE(sub_cycles, result = fix16_sub(a, b));
+        if (delta(result, expected) > max_delta)
+        {
+            print_value("Failed SUB, i", i);
+            print_value("Failed SUB, a", a);
+            print_value("Failed SUB, b", b);
+            print_value("Failed SUB, output", result);
+            print_value("Failed SUB, expected", expected);
+        }
+        
+        expected = testcases2[i].mul;
+        MEASURE(mul_cycles, result = fix16_mul(a, b));
+        if (delta(result, expected) > max_delta)
+        {
+            print_value("Failed MUL, i", i);
+            print_value("Failed MUL, a", a);
+            print_value("Failed MUL, b", b);
+            print_value("Failed MUL, output", result);
+            print_value("Failed MUL, expected", expected);
+        }
+        // Division is neither timed nor checked when the divisor is zero.
+        if (b != 0)
+        {
+            expected = testcases2[i].div;
+            MEASURE(div_cycles, result = fix16_div(a, b));
+            if (delta(result, expected) > max_delta)
+            {
+                print_value("Failed DIV, i", i);
+                print_value("Failed DIV, a", a);
+                print_value("Failed DIV, b", b);
+                print_value("Failed DIV, output", result);
+                print_value("Failed DIV, expected", expected);
+            }
+        }
+    }
+    PRINT(add_cycles, "fix16_add");
+    PRINT(sub_cycles, "fix16_sub");
+    PRINT(mul_cycles, "fix16_mul");
+    PRINT(div_cycles, "fix16_div");
+    
+    /* Compare with floating point performance */
+#ifndef NO_FLOAT
+    for (i = 0; i < TESTCASES1_COUNT; i++)
+    {
+        float input = fix16_to_float(testcases1[i].a);
+        volatile float result; // only written; the float results are timed, not verified
+        MEASURE(float_sqrtf_cycles, result = sqrtf(input));
+    }
+    PRINT(float_sqrtf_cycles, "float sqrtf");
+    
+    for (i = 0; i < TESTCASES2_COUNT; i++)
+    {
+        float a = fix16_to_float(testcases2[i].a);
+        float b = fix16_to_float(testcases2[i].b);
+        volatile float result;
+        MEASURE(float_add_cycles, result = a + b);
+        MEASURE(float_sub_cycles, result = a - b);
+        MEASURE(float_mul_cycles, result = a * b);
+        // As in the fix16 loop, division by zero is skipped.
+        if (b != 0)
+        {
+            MEASURE(float_div_cycles, result = a / b);
+        }
+    }
+    PRINT(float_add_cycles, "float add");
+    PRINT(float_sub_cycles, "float sub");
+    PRINT(float_mul_cycles, "float mul");
+    PRINT(float_div_cycles, "float div");
+#endif    
+
+    return 0;
+}
diff --git a/benchmarks/generate_testcases.py b/benchmarks/generate_testcases.py
new file mode 100644
index 0000000..560ba4d
--- /dev/null
+++ b/benchmarks/generate_testcases.py
@@ -0,0 +1,140 @@
+'''This script precalculates the correct solutions for a set of test numbers,
+and writes them to testcases.c. This is aimed for running the tests on-target,
+therefore it doesn't test all the cases or use floating point math, but
+instead generates a ~10k binary.
+
+The tests are chosen randomly, so there is quite good chance to eventually
+catch most errors. Because the list is not regenerated automatically, the
+functioning of the benchmark application is still deterministic and easy
+to debug.
+'''
+
+import math
+import random
+import struct
+
+# Fix16 scaling factor
+scale = 65536.
+
+# Fix16 overflow indicator
+overflow = -2**31
+
+def f16_to_float(val):
+    return val / scale  # raw fix16 integer -> the real value it represents
+
+def float_to_f16(val):
+    val = int(round(val * scale))
+    if val >= 2**31 or val < -2**31:
+        val = overflow
+    return val
+
+def to_ui32(val):
+    return struct.unpack('I', struct.pack('i', val))[0]  # reinterpret the signed int's bytes as unsigned
+
+testcases = [
+    # Small numbers
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+    -1, -2, -3, -4, -5, -6, -7, -8, -9, -10,
+    
+    # Integer numbers
+    0x10000, -0x10000, 0x20000, -0x20000, 0x30000, -0x30000,
+    0x40000, -0x40000, 0x50000, -0x50000, 0x60000, -0x60000,
+    
+    # Fractions (1/2, 1/4, 1/8)
+    0x8000, -0x8000, 0x4000, -0x4000, 0x2000, -0x2000,
+    
+    # Problematic carry
+    0xFFFF, -0xFFFF, 0x1FFFF, -0x1FFFF, 0x3FFFF, -0x3FFFF,
+    
+    # Smallest and largest values
+    0x7FFFFFFF, -0x80000000
+]
+    
+for i in range(10):
+    # Large random numbers
+    testcases.append(random.randint(-0x80000000, 0x7FFFFFFF))
+    
+    # Small random numbers
+    testcases.append(random.randint(-100000, 100000))
+    
+    # Tiny random numbers
+    testcases.append(random.randint(-200, 200))
+
+out = open("testcases.c", "w")
+
+out.write('''
+/* Automatically generated testcases for fix16 operations
+ * See generate_testcases.py for the generator.
+ */
+
+#include <fix16.h>
+
+typedef struct {
+    // Input
+    fix16_t a;
+    
+    // Correct output
+    fix16_t sqrt;
+    fix16_t exp;
+} fix16_1op_testcase;
+
+typedef struct {
+    // Inputs
+    fix16_t a;
+    fix16_t b;
+    
+    // Correct output
+    fix16_t add;
+    fix16_t sub;
+    fix16_t mul;
+    fix16_t div;
+} fix16_2op_testcase;
+
+#define TESTCASES1_COUNT (sizeof(testcases1)/sizeof(testcases1[0]))
+#define TESTCASES2_COUNT (sizeof(testcases2)/sizeof(testcases2[0]))
+
+''')
+
+# Write testcases for 1-operand functions
+
+out.write('static const fix16_1op_testcase testcases1[] = {\n')
+
+for i in range(10):
+    a = random.choice(testcases)
+    if a >= 0:
+        sqrt = float_to_f16(math.sqrt(f16_to_float(a)))
+    else:
+        sqrt = 0
+    
+    try:
+        exp = float_to_f16(math.exp(f16_to_float(a)))
+    except OverflowError:
+        exp = 0x7FFFFFFF
+        
+    out.write('    {0x%08x, 0x%08x, 0x%08x}, // %d\n'
+    % (to_ui32(a), to_ui32(sqrt), to_ui32(exp), i))
+
+out.write('};\n\n')
+
+# Write testcases for 2-operand functions
+
+out.write('static const fix16_2op_testcase testcases2[] = {\n')
+
+for i in range(50):
+    a = random.choice(testcases)
+    b = random.choice(testcases)
+    
+    add = float_to_f16(f16_to_float(a) + f16_to_float(b))
+    sub = float_to_f16(f16_to_float(a) - f16_to_float(b))
+    mul = float_to_f16(f16_to_float(a) * f16_to_float(b))
+    if b != 0:
+        div = float_to_f16(f16_to_float(a) / f16_to_float(b))
+    else:
+        div = 0
+    out.write('    {0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x, 0x%08x}, // %d\n'
+        % (to_ui32(a), to_ui32(b), to_ui32(add), to_ui32(sub), to_ui32(mul), to_ui32(div), i))
+
+out.write('};\n\n')
+
+out.close()
+    
diff --git a/benchmarks/interface-arm.c b/benchmarks/interface-arm.c
new file mode 100644
index 0000000..cd37979
--- /dev/null
+++ b/benchmarks/interface-arm.c
@@ -0,0 +1,32 @@
+#include "interface.h"
+#include <stdint.h>
+#include <stdio.h>
+
+// This targets an ARM Cortex M3 core using QEmu LM3S6965 emulation.
+#define STBASE 0xE000E000
+#define STCTRL (*(volatile uint32_t*)(0x010 + STBASE))
+#define STRELOAD (*(volatile uint32_t*)(0x014 + STBASE))
+#define STCURRENT (*(volatile uint32_t*)(0x018 + STBASE))
+
+
+void interface_init()
+{
+    STRELOAD = 0x00FFFFFF; // largest value of the 24-bit SysTick reload register
+    STCTRL = 5;            // bit 0 (ENABLE) + bit 2 (CLKSOURCE = core clock); interrupt bit left clear
+}
+
+void start_timing()
+{
+     STCURRENT = 0; // any write clears the counter; it then reloads from STRELOAD and counts down
+}
+
+uint16_t end_timing() // NOTE(review): the 24-bit tick count is truncated to 16 bits by the return type
+{
+     return 0x00FFFFFF - STCURRENT - 4; // ticks counted down since the reload, less 4 (presumably timing-call overhead)
+}
+
+void print_value(const char *label, int32_t value)
+{
+    printf("%-20s %ld\n", label, value); // NOTE(review): %ld assumes int32_t is long on this toolchain — confirm
+}
+
diff --git a/benchmarks/interface-avr.c b/benchmarks/interface-avr.c
new file mode 100644
index 0000000..02731aa
--- /dev/null
+++ b/benchmarks/interface-avr.c
@@ -0,0 +1,39 @@
+#include <avr/io.h>
+#include <stdio.h>
+#include "interface.h"
+#include <stdint.h>
+
+#define special_output_port (*((volatile char *)0x20))
+static int output_char(char c, FILE *stream)
+{
+    special_output_port = c; // store the byte at address 0x20 (the Makefile passes -W 0x20,- to simulavr); stream is unused
+    return 0;
+}
+
+static FILE mystdout = FDEV_SETUP_STREAM(output_char, NULL, _FDEV_SETUP_WRITE);
+
+void interface_init()
+{
+    // Set timer 1 to count cycles
+    TCCR1B = 1; // presumably clock-select = system clock with no prescaler — confirm against the datasheet
+    
+    // Set output to simulator
+    stdout = &mystdout; // both streams go through output_char
+    stderr = &mystdout;
+}
+
+
+void start_timing()
+{
+    TCNT1 = 0; // restart the timer 1 count from zero
+}
+
+uint16_t end_timing()
+{
+    return TCNT1 - 9; // timer 1 count since start_timing(), less 9 (presumably timing-call overhead)
+}
+
+void print_value(const char *label, int32_t value)
+{
+    printf("%-20s %ld\n", label, value); // label left-justified in 20 columns, then the value
+}
diff --git a/benchmarks/interface.h b/benchmarks/interface.h
new file mode 100644
index 0000000..f8c5117
--- /dev/null
+++ b/benchmarks/interface.h
@@ -0,0 +1,16 @@
+// This file declares the hardware or simulator interface that the benchmark
+// uses to measure timings and report results; each target implements it.
+
+#include <stdint.h>
+
+// Set up whatever the target needs for timing and output; call once at startup.
+void interface_init(void);
+
+// Reset the timer/counter; call immediately before the code being measured.
+void start_timing(void);
+
+// Return the clock cycles elapsed since start_timing(); 16 bits, so long intervals wrap.
+uint16_t end_timing(void);
+
+// Print a value to console, along with a descriptive label
+void print_value(const char *label, int32_t value);