Merging a bunch of separately developed functions:

fix16_mul, fix16_div, fix16_sqrt. They are faster & more accurate than the previous versions. Closes issue #13. Includes unittests for the functions in question, runnable by typing 'make' in the unittests folder.
2012-01-26 15:43:30 +00:00 · 2012-01-26 15:43:30 +00:00 · 1076285d26
commit 1076285d26
parent 681fd5aea1
6 changed files with 968 additions and 320 deletions
--- a/libfixmath/fix16.c
+++ b/libfixmath/fix16.c
@ -2,243 +2,462 @@
 #include "int64.h"
 /* Subtraction and addition with overflow detection.
 * The versions without overflow detection are inlined in the header.
 */
 #ifndef FIXMATH_NO_OVERFLOW
 fix16_t fix16_add(fix16_t a, fix16_t b)
 {
  // Use unsigned integers because overflow with signed integers is
  // an undefined operation (http://www.airs.com/blog/archives/120).
  uint32_t _a = a, _b = b;
  uint32_t sum = _a + _b;
-fix16_t fix16_sadd(fix16_t inArg0, fix16_t inArg1) {
+  // Overflow can only happen if sign of a == sign of b, and then
-	fix16_t tempResult = (inArg0 + inArg1);
+  // it causes sign of sum != sign of a.
-	if((tempResult > 0) && (inArg0 < 0) && (inArg1 < 0))
+  if (!((_a ^ _b) & 0x80000000) && ((_a ^ sum) & 0x80000000))
-		return fix16_min;
+    return fix16_overflow;
-	if((tempResult < 0) && (inArg0 > 0) && (inArg1 > 0))
+  
-		return fix16_max;
+  return sum;
 	return tempResult;
 }
 fix16_t fix16_sub(fix16_t a, fix16_t b)
 {
  uint32_t _a = a, _b = b;
  uint32_t diff = _a - _b;
-
+  // Overflow can only happen if sign of a != sign of b, and then
-#if defined(__arm__) || defined(_ARM) || defined(__thumb2__)
+  // it causes sign of diff != sign of a.
-fix16_t fix16_mul(int32_t inArg0, int32_t inArg1) {
+  if (((_a ^ _b) & 0x80000000) && ((_a ^ diff) & 0x80000000))
-	register fix16_t res;
+    return fix16_overflow;
-	register fix16_t tmp;
+  
-	asm(
+  return diff;
 	"smull %0, %3, %1, %2\n\t"
 	#ifndef FIXMATH_NO_ROUNDING
 	"add   %0, %0, #0x8000\n\t"
 	#endif
 	"mov   %0, %0, lsr #16\n\t"
 	"orr   %0, %0, %3, lsl #16"
 	: "=r"(res)
 	: "r"(inArg0), "r"(inArg1), "r"(tmp)
 	: "r0");
 	return res;
 }
 #else
 fix16_t fix16_mul(fix16_t inArg0, fix16_t inArg1) {
 	#ifndef FIXMATH_NO_64BIT
 	int64_t tempResult = ((int64_t)inArg0 * (int64_t)inArg1);
 	#ifndef FIXMATH_NO_ROUNDING
 	tempResult += (fix16_one >> 1);
 	#endif
 	tempResult >>= 16;
 	return tempResult;
 	#else
 	 int16_t hi[2] = { (inArg0 >> 16),    (inArg1 >> 16)    };
 	uint16_t lo[2] = { (inArg0 & 0xFFFF), (inArg1 & 0xFFFF) };
-	 int32_t r_hi = hi[0] * hi[1];
+/* Saturating arithmetic */
-	 int32_t r_md = (hi[0] * lo[1]) + (hi[1] * lo[0]);
+fix16_t fix16_sadd(fix16_t a, fix16_t b)
-	uint32_t r_lo = lo[0] * lo[1];
+{
-	#ifndef FIXMATH_NO_ROUNDING
+  fix16_t result = fix16_add(a, b);
 	r_lo += 0xFFFF;
 	#endif
-	r_md += (r_hi & 0xFFFF) << 16;
+  if (result == fix16_overflow)
-	r_md += (r_lo >> 16);
+    return (a > 0) ? fix16_max : fix16_min;
-	return r_md;
+  return result;
-	#endif
+}  
 fix16_t fix16_ssub(fix16_t a, fix16_t b)
 {
  fix16_t result = fix16_sub(a, b);
  if (result == fix16_overflow)
    return (a > 0) ? fix16_max : fix16_min;
  return result;
 }
 #endif
 /* 64-bit implementation for fix16_mul. Fastest version for e.g. ARM Cortex M3.
 * Performs a 32*32 -> 64bit multiplication. The middle 32 bits are the result,
 * bottom 16 bits are used for rounding, and upper 16 bits are used for overflow
 * detection.
 */
 #if !defined(FIXMATH_NO_64BIT) && !defined(FIXMATH_OPTIMIZE_8BIT)
 fix16_t fix16_mul(fix16_t inArg0, fix16_t inArg1)
 {
  int64_t product = (int64_t)inArg0 * inArg1;
  #ifndef FIXMATH_NO_OVERFLOW
  // The upper 17 bits should all be the same (the sign).
  uint32_t upper = (product >> 47);
  #endif
  if (product < 0)
  {
    #ifndef FIXMATH_NO_OVERFLOW
    if (~upper)
        return fix16_overflow;
    #endif
    #ifndef FIXMATH_NO_ROUNDING
    // This adjustment is required in order to round -1/2 correctly
    product--;
    #endif
  }
  else
  {
    #ifndef FIXMATH_NO_OVERFLOW
    if (upper)
        return fix16_overflow;
    #endif
  }
  #ifdef FIXMATH_NO_ROUNDING
  return product >> 16;
  #else
  fix16_t result = product >> 16;
  result += (product & 0x8000) >> 15;
  return result;
  #endif
 }
 #endif
 /* 32-bit implementation of fix16_mul. Potentially fast on 16-bit processors,
 * and this is a relatively good compromise for compilers that do not support
 * uint64_t. Uses 16*16->32bit multiplications.
 */
 #if defined(FIXMATH_NO_64BIT) && !defined(FIXMATH_OPTIMIZE_8BIT)
 fix16_t fix16_mul(fix16_t inArg0, fix16_t inArg1)
 {
  // Each argument is divided to 16-bit parts.
  //          AB
  //      *   CD
  // -----------
  //          BD  16 * 16 -> 32 bit products
  //         CB
  //         AD
  //        AC
  //       |----| 64 bit product
  int32_t A = (inArg0 >> 16), C = (inArg1 >> 16);
  uint32_t B = (inArg0 & 0xFFFF), D = (inArg1 & 0xFFFF);
  int32_t AC = A*C;
  int32_t AD_CB = A*D + C*B;
  uint32_t BD = B*D;
  int32_t product_hi = AC + (AD_CB >> 16);
  // Handle carry from lower 32 bits to upper part of result.
  uint32_t ad_cb_temp = AD_CB << 16;
  uint32_t product_lo = BD + ad_cb_temp;
  if (product_lo < BD)
    product_hi++;
 #ifndef FIXMATH_NO_OVERFLOW
  // The upper 17 bits should all be the same (the sign).
  if (product_hi >> 31 != product_hi >> 15)
    return fix16_overflow;
 #endif
 #ifdef FIXMATH_NO_ROUNDING
  return (product_hi << 16) | (product_lo >> 16);
 #else
  // Subtracting 0x8000 (= 0.5) and then using signed right shift
  // achieves proper rounding to result-1, except in the corner
  // case of negative numbers and lowest word = 0x8000.
  // To handle that, we also have to subtract 1 for negative numbers.
  uint32_t product_lo_tmp = product_lo;
  product_lo -= 0x8000;
  product_lo -= (uint32_t)product_hi >> 31;
  if (product_lo > product_lo_tmp)
    product_hi--;
  // Discard the lowest 16 bits. Note that this is not exactly the same
  // as dividing by 0x10000. For example if product = -1, result will
  // also be -1 and not 0. This is compensated by adding +1 to the result
  // and compensating this in turn in the rounding above.
  fix16_t result = (product_hi << 16) | (product_lo >> 16);
  result += 1;
  return result;
 #endif
 }
 #endif
 /* 8-bit implementation of fix16_mul. Fastest on e.g. Atmel AVR.
 * Uses 8*8->16bit multiplications, and also skips any bytes that
 * are zero.
 */
 #if defined(FIXMATH_OPTIMIZE_8BIT)
 fix16_t fix16_mul(fix16_t inArg0, fix16_t inArg1)
 {
  uint32_t _a = (inArg0 >= 0) ? inArg0 : (-inArg0);
  uint32_t _b = (inArg1 >= 0) ? inArg1 : (-inArg1);
  uint8_t va[4] = {_a, (_a >> 8), (_a >> 16), (_a >> 24)};
  uint8_t vb[4] = {_b, (_b >> 8), (_b >> 16), (_b >> 24)};
  uint32_t low = 0;
  uint32_t mid = 0;
  // Result column i depends on va[0..i] and vb[i..0]
  #ifndef FIXMATH_NO_OVERFLOW
  // i = 6
  if (va[3] && vb[3]) return fix16_overflow;
  #endif
  // i = 5
  if (va[2] && vb[3]) mid += (uint16_t)va[2] * vb[3];
  if (va[3] && vb[2]) mid += (uint16_t)va[3] * vb[2];
  mid <<= 8;
  // i = 4
  if (va[1] && vb[3]) mid += (uint16_t)va[1] * vb[3];
  if (va[2] && vb[2]) mid += (uint16_t)va[2] * vb[2];
  if (va[3] && vb[1]) mid += (uint16_t)va[3] * vb[1];
  #ifndef FIXMATH_NO_OVERFLOW
  if (mid & 0xFF000000) return fix16_overflow;
  #endif
  mid <<= 8;
  // i = 3
  if (va[0] && vb[3]) mid += (uint16_t)va[0] * vb[3];
  if (va[1] && vb[2]) mid += (uint16_t)va[1] * vb[2];
  if (va[2] && vb[1]) mid += (uint16_t)va[2] * vb[1];
  if (va[3] && vb[0]) mid += (uint16_t)va[3] * vb[0];
  #ifndef FIXMATH_NO_OVERFLOW
  if (mid & 0xFF000000) return fix16_overflow;
  #endif
  mid <<= 8;
  // i = 2
  if (va[0] && vb[2]) mid += (uint16_t)va[0] * vb[2];
  if (va[1] && vb[1]) mid += (uint16_t)va[1] * vb[1];
  if (va[2] && vb[0]) mid += (uint16_t)va[2] * vb[0];    
  // i = 1
  if (va[0] && vb[1]) low += (uint16_t)va[0] * vb[1];
  if (va[1] && vb[0]) low += (uint16_t)va[1] * vb[0];
  low <<= 8;
  // i = 0
  if (va[0] && vb[0]) low += (uint16_t)va[0] * vb[0];
  #ifndef FIXMATH_NO_ROUNDING
  low += 0x8000;
  #endif
  mid += (low >> 16);
  #ifndef FIXMATH_NO_OVERFLOW
  if (mid & 0x80000000)
    return fix16_overflow;
  #endif
  fix16_t result = mid;
  /* Figure out the sign of result */
  if ((inArg0 >= 0) != (inArg1 >= 0))
  {
    result = -result;
  }
  return result;
 }
 #endif
 #ifndef FIXMATH_NO_OVERFLOW
 /* Wrapper around fix16_mul to add saturating arithmetic. */
 fix16_t fix16_smul(fix16_t inArg0, fix16_t inArg1) {
-	#ifndef FIXMATH_NO_64BIT
+  fix16_t result = fix16_mul(inArg0, inArg1);
-	int64_t tempResult = ((int64_t)inArg0 * (int64_t)inArg1);
+  
-	#ifndef FIXMATH_NO_ROUNDING
+  if (result == fix16_overflow)
-	tempResult += (fix16_one >> 1);
+  {
-	#endif
+    if ((inArg0 >= 0) == (inArg1 >= 0))
-	tempResult >>= 16;
+      return fix16_max;
-	if(tempResult < fix16_min)
+    else
-		return fix16_min;
+      return fix16_min;
-	if(tempResult > fix16_max)
+  }
-		return fix16_max;
+  
-	return tempResult;
+  return result;
 	#else
 	 int16_t hi[2] = { (inArg0 >> 16),    (inArg1 >> 16)    };
 	 int32_t r_hi = hi[0] * hi[1];
 	 if(r_hi >> 16)
 		return (r_hi < 0 ? fix16_min : fix16_max);
 	uint16_t lo[2] = { (inArg0 & 0xFFFF), (inArg1 & 0xFFFF) };
 	 int32_t r_md = (hi[0] * lo[1]) + (hi[1] * lo[0]);
 	uint32_t r_lo = lo[0] * lo[1];
 	#ifndef FIXMATH_NO_ROUNDING
 	r_lo += 0xFFFF;
 	#endif
 	r_md += (r_hi & 0xFFFF) << 16;
 	r_md += (r_lo >> 16);
 	return r_md;
 	#endif
 }
 #endif
-
+/* 32-bit implementation of fix16_div. Fastest version for e.g. ARM Cortex M3.
-
+ * Performs 32-bit divisions repeatedly to reduce the remainder. For this to
-fix16_t fix16_div(fix16_t inArg0, fix16_t inArg1) {
+ * be efficient, the processor has to have 32-bit hardware division.
-	#ifndef FIXMATH_NO_64BIT
+ */
-	int64_t tempResult = inArg0;
+#if !defined(FIXMATH_OPTIMIZE_8BIT)
-	tempResult <<= 16;
+#ifdef __GNUC__
-	#ifndef FIXMATH_NO_ROUNDING
+// Count leading zeros, using processor-specific instruction if available.
-	tempResult += (inArg1 >> 1);
+#define clz(x) __builtin_clzl(x)
-	#endif
+#else
-	tempResult /= inArg1;
+static uint8_t clz(uint32_t x)
-	return tempResult;
+{
-	#else
+  uint8_t result = 0;
-	int neg = ((inArg0 < 0) != (inArg1 < 0));
+  if (x == 0) return 32;
-	inArg0 = (inArg0 < 0 ? -inArg0 : inArg0);
+  while (!(x & 0xF0000000)) { result += 4; x <<= 4; }
-	inArg1 = (inArg1 < 0 ? -inArg1 : inArg1);
+  while (!(x & 0x80000000)) { result += 1; x <<= 1; }
-
+  return result;
 	while(((inArg0 | inArg1) & 1) == 0) {
 		inArg0 >>= 1;
 		inArg1 >>= 1;
 	}
 	uint32_t r_hi = (inArg0 / inArg1);
 	uint32_t n_lo = (inArg0 % inArg1);
 	uint32_t n_hi = (n_lo >> 16);
 	n_lo <<= 16;
 	uint32_t n_lo_orig = n_lo;
 	uint32_t i, arg;
 	for(i = 1, arg = inArg1; ((n_lo | arg) & 1) == 0; i <<= 1) {
 		n_lo = ((n_lo >> 1) | (n_hi << 31));
 		n_hi =  (n_hi >> 1);
 		arg >>= 1;
 	}
 	n_lo = n_lo_orig;
 	uint32_t res = 0;
 	if(n_hi) {
 		uint32_t arg_lo, arg_hi;
 		for(arg_lo = inArg1; (arg_lo >> 31) == 0; arg_lo <<= 1, i <<= 1);
 		for(arg_hi = (arg_lo >> 31), arg_lo <<= 1, i <<= 1; arg_hi < n_hi; arg_hi = (arg_hi << 1) | (arg_lo >> 31), arg_lo <<= 1, i <<= 1);
 		do {
 			arg_lo = (arg_lo >> 1) | (arg_hi << 31);
 			arg_hi = (arg_hi >> 1);
 			i >>= 1;
 			if(arg_hi < n_hi) {
 				n_hi -= arg_hi;
 				if(arg_lo > n_lo)
 					n_hi--;
 				n_lo -= arg_lo;
 				res += i;
 			} else if((arg_hi == n_hi) && (arg_lo <= n_lo)) {
 				n_hi -= arg_hi;
 				n_lo -= arg_lo;
 				res += i;
 			}
 		} while(n_hi);
 	}
 	res += (n_lo / inArg1);
 	#ifndef FIXMATH_NO_ROUNDING
 	if((n_lo % inArg1) >= (inArg1 >> 1))
 		res++;
 	#endif
 	res += (r_hi << 16);
 	return (neg ? -res : res);
 	#endif
 }
 #endif
 fix16_t fix16_div(fix16_t a, fix16_t b)
 {
  // This uses a hardware 32/32 bit division multiple times, until we have
  // computed all the bits in (a<<17)/b. Usually this takes 1-3 iterations.
  if (b == 0)
      return fix16_min;
  uint32_t remainder = (a >= 0) ? a : (-a);
  uint32_t divider = (b >= 0) ? b : (-b);
  uint32_t quotient = 0;
  int bit_pos = 17;
  // Kick-start the division a bit.
  // This improves speed in the worst-case scenarios where N and D are large
  // It gets a lower estimate for the result by N/(D >> 17 + 1).
  if (divider & 0xFFF00000)
  {
    uint32_t shifted_div = ((divider >> 17) + 1);
    quotient = remainder / shifted_div;
    remainder -= ((uint64_t)quotient * divider) >> 17;
  }
  // If the divider is divisible by 2^n, take advantage of it.
  while (!(divider & 0xF) && bit_pos >= 4)
  {
    divider >>= 4;
    bit_pos -= 4;
  }
  while (remainder && bit_pos >= 0)
  {
    // Shift remainder as much as we can without overflowing
    int shift = clz(remainder);
    if (shift > bit_pos) shift = bit_pos;
    remainder <<= shift;
    bit_pos -= shift;
    uint32_t div = remainder / divider;
    remainder = remainder % divider;
    quotient += div << bit_pos;
    #ifndef FIXMATH_NO_OVERFLOW
    if (div & ~(0xFFFFFFFF >> bit_pos))
        return fix16_overflow;
    #endif
    remainder <<= 1;
    bit_pos--;
  }
  #ifndef FIXMATH_NO_ROUNDING
  // Quotient is always positive so rounding is easy
  quotient++;
  #endif
  fix16_t result = quotient >> 1;
  // Figure out the sign of the result
  if ((a ^ b) & 0x80000000)
  {
    #ifndef FIXMATH_NO_OVERFLOW
    if (result == fix16_min)
        return fix16_overflow;
    #endif
    result = -result;
  }
  return result;
 }
 #endif
 /* Alternative 32-bit implementation of fix16_div. Fastest on e.g. Atmel AVR.
 * This does the division manually, and is therefore good for processors that
 * do not have hardware division.
 */
 #if defined(FIXMATH_OPTIMIZE_8BIT)
 fix16_t fix16_div(fix16_t a, fix16_t b)
 {
  // This uses the basic binary restoring division algorithm.
  // It appears to be faster to do the whole division manually than
  // trying to compose a 64-bit divide out of 32-bit divisions on
  // platforms without hardware divide.
  if (b == 0)
    return fix16_min;
  uint32_t remainder = (a >= 0) ? a : (-a);
  uint32_t divider = (b >= 0) ? b : (-b);
  uint32_t quotient = 0;
  uint32_t bit = 0x10000;
  /* The algorithm requires D >= R */
  while (divider < remainder)
  {
    divider <<= 1;
    bit <<= 1;
  }
  #ifndef FIXMATH_NO_OVERFLOW
  if (!bit)
    return fix16_overflow;
  #endif
  if (divider & 0x80000000)
  {
    // Perform one step manually to avoid overflows later.
    // We know that divider's bottom bit is 0 here.
    if (remainder >= divider)
    {
        quotient |= bit;
        remainder -= divider;
    }
    divider >>= 1;
    bit >>= 1;
  }
  /* Main division loop */
  while (bit && remainder)
  {
    if (remainder >= divider)
    {
        quotient |= bit;
        remainder -= divider;
    }
    remainder <<= 1;
    bit >>= 1;
  }   
  #ifndef FIXMATH_NO_ROUNDING
  if (remainder >= divider)
  {
    quotient++;
  }
  #endif
  fix16_t result = quotient;
  /* Figure out the sign of result */
  if ((a ^ b) & 0x80000000)
  {
    #ifndef FIXMATH_NO_OVERFLOW
    if (result == fix16_min)
        return fix16_overflow;
    #endif
    result = -result;
  }
  return result;
 }
 #endif
 #ifndef FIXMATH_NO_OVERFLOW
 /* Wrapper around fix16_div to add saturating arithmetic. */
 fix16_t fix16_sdiv(fix16_t inArg0, fix16_t inArg1) {
-	if(inArg1 == 0) {
+  fix16_t result = fix16_div(inArg0, inArg1);
-		if(inArg0 < 0)
+  
-			return fix16_min;
+  if (result == fix16_overflow)
-		return fix16_max;
+  {
-	}
+    if ((inArg0 >= 0) == (inArg1 >= 0))
-	#ifndef FIXMATH_NO_64BIT
+      return fix16_max;
-	int64_t tempResult = inArg0;
+    else
-	tempResult <<= 16;
+      return fix16_min;
-	#ifndef FIXMATH_NO_ROUNDING
+  }
-	tempResult += (inArg1 >> 1);
+  
-	#endif
+  return result;
 	tempResult /= inArg1;
 	if(tempResult < fix16_min)
 		return fix16_min;
 	if(tempResult > fix16_max)
 		return fix16_max;
 	return tempResult;
 	#else
 	int neg = ((inArg0 < 0) != (inArg1 < 0));
 	inArg0 = (inArg0 < 0 ? -inArg0 : inArg0);
 	inArg1 = (inArg1 < 0 ? -inArg1 : inArg1);
 	while(((inArg0 | inArg1) & 1) == 0) {
 		inArg0 >>= 1;
 		inArg1 >>= 1;
 	}
 	uint32_t r_hi = (inArg0 / inArg1);
 	if(r_hi > (neg ? 32768 : 32767))
 		return (neg ? fix16_min : fix16_max);
 	uint32_t n_lo = (inArg0 % inArg1);
 	uint32_t n_hi = (n_lo >> 16);
 	n_lo <<= 16;
 	uint32_t i, arg;
 	for(i = 1, arg = inArg1; ((n_lo | arg) & 1) == 0; i <<= 1) {
 		n_lo = ((n_lo >> 1) | (n_hi << 31));
 		n_hi =  (n_hi >> 1);
 		arg >>= 1;
 	}
 	uint32_t res = 0;
 	if(n_hi) {
 		uint32_t arg_lo, arg_hi;
 		for(arg_lo = inArg1; (arg_lo >> 31) == 0; arg_lo <<= 1, i <<= 1);
 		for(arg_hi = (arg_lo >> 31), arg_lo <<= 1, i <<= 1; arg_hi < n_hi; arg_hi = (arg_hi << 1) | (arg_lo >> 31), arg_lo <<= 1, i <<= 1);
 		do {
 			arg_lo = (arg_lo >> 1) | (arg_hi << 31);
 			arg_hi = (arg_hi >> 1);
 			i >>= 1;
 			if(arg_hi < n_hi) {
 				n_hi -= arg_hi;
 				if(arg_lo > n_lo)
 					n_hi--;
 				n_lo -= arg_lo;
 				res += i;
 			} else if((arg_hi == n_hi) && (arg_lo <= n_lo)) {
 				n_hi -= arg_hi;
 				n_lo -= arg_lo;
 				res += i;
 			}
 		} while(n_hi);
 	}
 	#ifndef FIXMATH_NO_ROUNDING
 	n_lo += (inArg1 >> 1);
 	#endif
 	res += (n_lo / inArg1);
 	res += (r_hi << 16);
 	return (neg ? -res : res);
 	#endif
 }
-
+#endif
 fix16_t fix16_lerp8(fix16_t inArg0, fix16_t inArg1, uint8_t inFract) {
 	int64_t tempOut = int64_mul_i32_i32(inArg0, ((1 << 8) - inFract));
--- a/libfixmath/fix16.h
+++ b/libfixmath/fix16.h
@ -18,65 +18,83 @@ static const fix16_t THREE_PI_DIV_4 = 0x00025B2F;       /*!< Fix16 value of 3PI/
 static const fix16_t fix16_max = 0x7FFFFFFF; /*!< the maximum value of fix16_t */
 static const fix16_t fix16_min = 0x80000000; /*!< the minimum value of fix16_t */
 static const fix16_t fix16_overflow = 0x80000000; /*!< the value used to indicate overflows when FIXMATH_NO_OVERFLOW is not specified */
 static const fix16_t fix16_pi  = 205887;     /*!< fix16_t value of pi */
 static const fix16_t fix16_e   = 178145;     /*!< fix16_t value of e */
 static const fix16_t fix16_one = 0x00010000; /*!< fix16_t value of 1 */
 /* Conversion functions between fix16_t and float/integer.
 * These are inlined to allow compiler to optimize away constant numbers
 */
 static inline fix16_t fix16_from_int(int a) { return a * fix16_one; }
 static inline float fix16_to_float(fix16_t a) { return (float)a / fix16_one; }
 static inline double fix16_to_dbl(fix16_t a) { return (double)a / fix16_one; }
 static inline int fix16_to_int(fix16_t a)
 {
 #ifdef FIXMATH_NO_ROUNDING
-/*! Converts a double to a fix16_t and returns the result. */
+    return a >> 16;
 static inline fix16_t fix16_from_dbl(const double inVal)  { return (fix16_t)(inVal * 65536.0); }
 /*! Converts a float to a fix16_t and returns the result. */
 static inline fix16_t fix16_from_float(const float inVal) { return (fix16_t)(inVal * 65536.0f); }
 #else
-/*! Converts a double to a fix16_t and returns the result. */
+    if (a >= 0)
-static inline fix16_t fix16_from_dbl(const double inVal)  { return (fix16_t)((inVal * 65536.0) + 0.5); }
+        return (a + fix16_one / 2) / fix16_one;
-/*! Converts a float to a fix16_t and returns the result. */
+    else
-static inline fix16_t fix16_from_float(const float inVal) { return (fix16_t)((inVal * 65536.0f) + 0.5f); }
+        return (a - fix16_one / 2) / fix16_one;
 #endif
-/*! Converts a signed integer to a fix16_t and returns the result. */
+}
 static inline fix16_t fix16_from_int(const int32_t inVal) { return (inVal << 16); }
-/*! Coverts a fix16_t to a double and returns the result. */
+static inline fix16_t fix16_from_float(float a)
-static inline double  fix16_to_dbl(const fix16_t inVal)   { return ((double)inVal / 65536.0); }
+{
-/*! Converts a fix16_t to a float and returns the result. */
+    float temp = a * fix16_one;
-static inline float   fix16_to_float(const fix16_t inVal) { return ((float)inVal / 65536.0f); }
+#ifndef FIXMATH_NO_ROUNDING
-/*! Converts a fix16_t to a signed integer and returns the result. */
+    temp += (temp >= 0) ? 0.5f : -0.5f;
-static inline int32_t fix16_to_int(const fix16_t inVal)   { return ((inVal + 0x00008000) >> 16); }
+#endif
    return (fix16_t)temp;
 }
 static inline fix16_t fix16_from_dbl(double a)
 {
    double temp = a * fix16_one;
 #ifndef FIXMATH_NO_ROUNDING
    temp += (temp >= 0) ? 0.5f : -0.5f;
 #endif
    return (fix16_t)temp;
 }
 /* Subtraction and addition with (optional) overflow detection. */
 #ifdef FIXMATH_NO_OVERFLOW
 static inline fix16_t fix16_add(fix16_t inArg0, fix16_t inArg1) { return (inArg0 + inArg1); }
 /*! Performs a saturated addition (overflow-protected) of the two given fix16_t's and returns the result.
 */
 extern fix16_t fix16_sadd(fix16_t inArg0, fix16_t inArg1);
 static inline fix16_t fix16_sub(fix16_t inArg0, fix16_t inArg1) { return (inArg0 - inArg1); }
-static inline fix16_t fix16_ssub(fix16_t inArg0, fix16_t inArg1) { return fix16_sadd(inArg0, -inArg1);}
+#else
 extern fix16_t fix16_add(fix16_t a, fix16_t b);
 extern fix16_t fix16_sub(fix16_t a, fix16_t b);
 /* Saturating arithmetic */
 extern fix16_t fix16_sadd(fix16_t a, fix16_t b);
 extern fix16_t fix16_ssub(fix16_t a, fix16_t b);
 #endif
 /*! Multiplies the two given fix16_t's and returns the result.
 */
 extern fix16_t fix16_mul(fix16_t inArg0, fix16_t inArg1);
 /*! Performs a saturated multiplication (overflow-protected) of the two given fix16_t's and returns the result.
 */
 extern fix16_t fix16_smul(fix16_t inArg0, fix16_t inArg1);
 /*! Divides the first given fix16_t by the second and returns the result.
 */
 extern fix16_t fix16_div(fix16_t inArg0, fix16_t inArg1);
 #ifndef FIXMATH_NO_OVERFLOW
 /*! Performs a saturated multiplication (overflow-protected) of the two given fix16_t's and returns the result.
 */
 extern fix16_t fix16_smul(fix16_t inArg0, fix16_t inArg1);
 /*! Performs a saturated division (overflow-protected) of the first fix16_t by the second and returns the result.
 */
 extern fix16_t fix16_sdiv(fix16_t inArg0, fix16_t inArg1);
-
+#endif
 /*! Returns the linear interpolation: (inArg0 * (1 - inFract)) + (inArg1 * inFract)
 */
--- a/libfixmath/fix16_sqrt.c
+++ b/libfixmath/fix16_sqrt.c
@ -1,76 +1,83 @@
 #include "fix16.h"
 #include "int64.h"
 #ifndef FIXMATH_NO_CACHE
 fix16_t _fix16_sqrt_cache_index[4096] = { 0 };
 fix16_t _fix16_sqrt_cache_value[4096] = { 0 };
 #endif
 fix16_t fix16_sqrt32(fix16_t inValue)
 {
 	fix16_t retval = 0;
 	fix16_t x = inValue;
 	/* Take only  Leading 1 bit */
 	x |= (x >> 1);
 	x |= (x >> 2);
 	x |= (x >> 4);
 	x |= (x >> 8);
 	x |= (x >> 16);
 	x = x & ~(x >> 1);
 	/* Avoid useless loops */
 	if (x & 0xFF000000)
 	{
 		x = 0x00800000;
 	}
 	/* Condition for numbers less than 1.0 */
 	if ( !(inValue>>16) ) x = 0x00008000;
 	while(x)
 	{
 		retval |= x;
 		if ( (uint32_t)fix16_mul(retval,retval) > (uint32_t)inValue)
 		{
 			retval &= ~x;
 		}
 		x >>= 1;
 	}
 	return retval;
 }
 /* The square root algorithm is quite directly from
 * http://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Binary_numeral_system_.28base_2.29
 * An important difference is that it is split to two parts
 * in order to use only 32-bit operations.
 *
 * Note that for negative numbers we return -sqrt(-inValue).
 * Not sure if someone relies on this behaviour, but not going
 * to break it for now. It doesn't slow the code much overall.
 */
 fix16_t fix16_sqrt(fix16_t inValue) {
-	int neg = (inValue < 0);
+  uint8_t neg = (inValue < 0);
-	if(neg)
+  uint32_t num = (neg ? -inValue : inValue);
-		inValue = -inValue;
+  uint32_t result = 0;
  uint32_t bit;
  uint8_t n;
  // Many numbers will be less than 15, so
  // this gives a good balance between time spent
  // in if vs. time spent in the while loop
  // when searching for the starting value.
  if (num & 0xFFF00000)
    bit = (uint32_t)1 << 30;
  else
    bit = (uint32_t)1 << 18;
  while (bit > num) bit >>= 2;
  // The main part is executed twice, in order to avoid
  // using 64 bit values in computations.
  for (n = 0; n < 2; n++)
  {
    // First we get the top 24 bits of the answer.
    while (bit)
    {
      if (num >= result + bit)
      {
        num -= result + bit;
        result = (result >> 1) + bit;
      }
      else
      {
        result = (result >> 1);
      }
      bit >>= 2;
    }
    if (n == 0)
    {
      // Then process it again to get the lowest 8 bits.
      if (num > 65535)
      {
        // The remainder 'num' is too large to be shifted left
        // by 16, so we have to add 1 to result manually and
        // adjust 'num' accordingly.
        // num = a - (result + 0.5)^2
        //   = num + result^2 - (result + 0.5)^2
        //   = num - result - 0.5
        num -= result;
        num = (num << 16) - 0x8000;
        result = (result << 16) + 0x8000;
      }
      else
      {
        num <<= 16;
        result <<= 16;
      }
      bit = 1 << 14;
    }
  }
-	#ifndef FIXMATH_NO_CACHE
+#ifndef FIXMATH_NO_ROUNDING
-	fix16_t tempIndex = (((inValue >> 16) ^ (inValue >> 4)) & 0x00000FFF);
+  // Finally, if next bit would have been 1, round the result upwards.
-	if(_fix16_sqrt_cache_index[tempIndex] == inValue)
+  if (num > result)
-		return (neg ? -_fix16_sqrt_cache_value[tempIndex] : _fix16_sqrt_cache_value[tempIndex]);
+  {
-	#endif
+    result++;
-
+  }
-	int64_t tempOp  = int64_const((inValue >> 16), (inValue << 16));
+#endif
-	int64_t tempOut = int64_const(0, 0);
+  
-	int64_t tempOne = int64_const(0x40000000UL, 0x00000000UL);
+  return (neg ? -result : result);
 	while(int64_cmp_gt(tempOne, tempOp))
 		tempOne = int64_shift(tempOne, -2);
 	while(int64_cmp_ne(tempOne, int64_const(0, 0))) {
 		if(int64_cmp_ge(tempOp, int64_add(tempOut, tempOne))) {
 			tempOp  = int64_sub(tempOp, int64_add(tempOut, tempOne));
 			tempOut = int64_add(tempOut, int64_shift(tempOne, 1));
 		}
 		tempOut = int64_shift(tempOut, -1);
 		tempOne = int64_shift(tempOne, -2);
 	}
 	#ifndef FIXMATH_NO_CACHE
 	_fix16_sqrt_cache_index[tempIndex] = inValue;
 	_fix16_sqrt_cache_value[tempIndex] = int64_lo(tempOut);
 	#endif
 	return (neg ? -int64_lo(tempOut) : int64_lo(tempOut));
 }
--- a/unittests/Makefile
+++ b/unittests/Makefile
@ -0,0 +1,49 @@
 # Makefile for running the unittests of libfixmath.
 CC = gcc
 # Basic CFLAGS for debugging
 CFLAGS = -g -O0 -I../libfixmath -Wall -Wextra -Werror
 # The files required for tests
 FIX16_SRC = ../libfixmath/fix16.c ../libfixmath/fix16_sqrt.c \
 	../libfixmath/fix16.h
 all: run_fix16_unittests
 clean:
 	rm -f fix16_unittests_????
 # The library is tested automatically under different compilations
 # options.
 #
 # Test naming:
 # r = rounding, n = no rounding
 # o = overflow detection, n = no overflow detection
 # 64 = int64_t math, 32 = int32_t math
 run_fix16_unittests: \
 	fix16_unittests_ro64 fix16_unittests_no64 \
 	fix16_unittests_rn64 fix16_unittests_nn64 \
 	fix16_unittests_ro32 fix16_unittests_no32 \
 	fix16_unittests_rn32 fix16_unittests_nn32 \
 	fix16_unittests_ro08 fix16_unittests_no08 \
 	fix16_unittests_rn08 fix16_unittests_nn08
 	$(foreach test, $^, \
 	echo $(test) && \
 	./$(test) > /dev/null && \
 	) true
 fix16_unittests_no64: DEFINES=-DFIXMATH_NO_ROUNDING
 fix16_unittests_rn64: DEFINES=-DFIXMATH_NO_OVERFLOW
 fix16_unittests_nn64: DEFINES=-DFIXMATH_NO_ROUNDING -DFIXMATH_NO_OVERFLOW
 fix16_unittests_ro32: DEFINES=-DFIXMATH_NO_64BIT
 fix16_unittests_no32: DEFINES=-DFIXMATH_NO_ROUNDING -DFIXMATH_NO_64BIT
 fix16_unittests_rn32: DEFINES=-DFIXMATH_NO_OVERFLOW -DFIXMATH_NO_64BIT
 fix16_unittests_nn32: DEFINES=-DFIXMATH_NO_OVERFLOW -DFIXMATH_NO_ROUNDING -DFIXMATH_NO_64BIT
 fix16_unittests_ro08: DEFINES=-DFIXMATH_OPTIMIZE_8BIT
 fix16_unittests_no08: DEFINES=-DFIXMATH_NO_ROUNDING -DFIXMATH_OPTIMIZE_8BIT
 fix16_unittests_rn08: DEFINES=-DFIXMATH_NO_OVERFLOW -DFIXMATH_OPTIMIZE_8BIT
 fix16_unittests_nn08: DEFINES=-DFIXMATH_NO_OVERFLOW -DFIXMATH_NO_ROUNDING -DFIXMATH_OPTIMIZE_8BIT
 fix16_unittests_% : fix16_unittests.c $(FIX16_SRC)
 	$(CC) $(CFLAGS) $(DEFINES) -o $@ $^ -lm
--- a/unittests/fix16_unittests.c
+++ b/unittests/fix16_unittests.c
@ -0,0 +1,337 @@
 #include <fix16.h>
 #include <stdio.h>
 #include <math.h>
 #include <stdbool.h>
 #include "unittests.h"
 const fix16_t testcases[] = {
  // Small numbers
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
  -1, -2, -3, -4, -5, -6, -7, -8, -9, -10,
  // Integer numbers
  0x10000, -0x10000, 0x20000, -0x20000, 0x30000, -0x30000,
  0x40000, -0x40000, 0x50000, -0x50000, 0x60000, -0x60000,
  // Fractions (1/2, 1/4, 1/8)
  0x8000, -0x8000, 0x4000, -0x4000, 0x2000, -0x2000,
  // Problematic carry
  0xFFFF, -0xFFFF, 0x1FFFF, -0x1FFFF, 0x3FFFF, -0x3FFFF,
  // Smallest and largest values
  0x7FFFFFFF, 0x80000000,
  // Large random numbers
  831858892, 574794913, 2147272293, -469161054, -961611615,
  1841960234, 1992698389, 520485404, 560523116, -2094993050,
  -876897543, -67813629, 2146227091, 509861939, -1073573657,
  // Small random numbers
  -14985, 30520, -83587, 41129, 42137, 58537, -2259, 84142,
  -28283, 90914, 19865, 33191, 81844, -66273, -63215, -44459,
  -11326, 84295, 47515, -39324,
  // Tiny random numbers
  -171, -359, 491, 844, 158, -413, -422, -737, -575, -330,
  -376, 435, -311, 116, 715, -1024, -487, 59, 724, 993
 };
 #define TESTCASES_COUNT (sizeof(testcases)/sizeof(testcases[0]))
 #define delta(a,b) (((a)>=(b)) ? (a)-(b) : (b)-(a))
 #ifdef FIXMATH_NO_ROUNDING
 const fix16_t max_delta = 1;
 #else
 const fix16_t max_delta = 0;
 #endif
 int main()
 {
  int status = 0;
  {
    COMMENT("Testing basic multiplication");
    TEST(fix16_mul(fix16_from_int(5), fix16_from_int(5)) == fix16_from_int(25));
    TEST(fix16_mul(fix16_from_int(-5), fix16_from_int(5)) == fix16_from_int(-25));
    TEST(fix16_mul(fix16_from_int(-5), fix16_from_int(-5)) == fix16_from_int(25));
    TEST(fix16_mul(fix16_from_int(5), fix16_from_int(-5)) == fix16_from_int(-25));
  }
 #ifndef FIXMATH_NO_ROUNDING
  {
    COMMENT("Testing multiplication rounding corner cases");
    TEST(fix16_mul(0, 10) == 0);
    TEST(fix16_mul(2, 0x8000) == 1);
    TEST(fix16_mul(-2, 0x8000) == -1);
    TEST(fix16_mul(3, 0x8000) == 2);
    TEST(fix16_mul(-3, 0x8000) == -2);
    TEST(fix16_mul(2, 0x7FFF) == 1);
    TEST(fix16_mul(-2, 0x7FFF) == -1);
    TEST(fix16_mul(2, 0x8001) == 1);
    TEST(fix16_mul(-2, 0x8001) == -1);
  }
 #endif
  {
    unsigned int i, j;
    int failures = 0;
    COMMENT("Running testcases for multiplication");
    for (i = 0; i < TESTCASES_COUNT; i++)
    {
      for (j = 0; j < TESTCASES_COUNT; j++)
      {
        fix16_t a = testcases[i];
        fix16_t b = testcases[j];
        fix16_t result = fix16_mul(a, b);
        double fa = fix16_to_dbl(a);
        double fb = fix16_to_dbl(b);
        fix16_t fresult = fix16_from_dbl(fa * fb);
        double max = fix16_to_dbl(fix16_max);
        double min = fix16_to_dbl(fix16_min);
        if (delta(fresult, result) > max_delta)
        {
          if (fa * fb > max || fa * fb < min)
          {
            #ifndef FIXMATH_NO_OVERFLOW
            if (result != fix16_overflow)
            {
              printf("\n%d * %d overflow not detected!\n", a, b);
              failures++;
            }
            #endif
            // Legitimate overflow
            continue;
          }
          printf("\n%d * %d = %d\n", a, b, result);
          printf("%f * %f = %d\n", fa, fb, fresult);
          failures++;
        }
      }
    }
    TEST(failures == 0);
  }
  {
    COMMENT("Testing basic division");
    TEST(fix16_div(fix16_from_int(15), fix16_from_int(5)) == fix16_from_int(3));
    TEST(fix16_div(fix16_from_int(-15), fix16_from_int(5)) == fix16_from_int(-3));
    TEST(fix16_div(fix16_from_int(-15), fix16_from_int(-5)) == fix16_from_int(3));
    TEST(fix16_div(fix16_from_int(15), fix16_from_int(-5)) == fix16_from_int(-3));
  }
 #ifndef FIXMATH_NO_ROUNDING
  {
    COMMENT("Testing division rounding corner cases");
    TEST(fix16_div(0, 10) == 0);
    TEST(fix16_div(1, fix16_from_int(2)) == 1);
    TEST(fix16_div(-1, fix16_from_int(2)) == -1);
    TEST(fix16_div(1, fix16_from_int(-2)) == -1);
    TEST(fix16_div(-1, fix16_from_int(-2)) == 1);
    TEST(fix16_div(3, fix16_from_int(2)) == 2);
    TEST(fix16_div(-3, fix16_from_int(2)) == -2);
    TEST(fix16_div(3, fix16_from_int(-2)) == -2);
    TEST(fix16_div(-3, fix16_from_int(-2)) == 2);
    TEST(fix16_div(2, 0x7FFF) == 4);
    TEST(fix16_div(-2, 0x7FFF) == -4);
    TEST(fix16_div(2, 0x8001) == 4);
    TEST(fix16_div(-2, 0x8001) == -4);
  }
 #endif
  {
    unsigned int i, j;
    int failures = 0;
    COMMENT("Running testcases for division");
    for (i = 0; i < TESTCASES_COUNT; i++)
    {
      for (j = 0; j < TESTCASES_COUNT; j++)
      {
        fix16_t a = testcases[i];
        fix16_t b = testcases[j];
        // We don't require a solution for /0 :)
        if (b == 0) continue;
        fix16_t result = fix16_div(a, b);
        double fa = fix16_to_dbl(a);
        double fb = fix16_to_dbl(b);
        fix16_t fresult = fix16_from_dbl(fa / fb);
        double max = fix16_to_dbl(fix16_max);
        double min = fix16_to_dbl(fix16_min);        
        if (delta(fresult, result) > max_delta)
        {
          if (fa / fb > max || fa / fb < min)
          {
            #ifndef FIXMATH_NO_OVERFLOW
            if (result != fix16_overflow)
            {
              printf("\n%d / %d overflow not detected!\n", a, b);
              failures++;
            }
            #endif
            // Legitimate overflow
            continue;
          }
          printf("\n%d / %d = %d\n", a, b, result);
          printf("%f / %f = %d\n", fa, fb, fresult);
          failures++;
        }
      }
    }
    TEST(failures == 0);
  }
  {
    unsigned int i, j;
    int failures = 0;
    COMMENT("Running testcases for addition");
    for (i = 0; i < TESTCASES_COUNT; i++)
    {
      for (j = 0; j < TESTCASES_COUNT; j++)
      {
        fix16_t a = testcases[i];
        fix16_t b = testcases[j];
        fix16_t result = fix16_add(a, b);
        double fa = fix16_to_dbl(a);
        double fb = fix16_to_dbl(b);
        fix16_t fresult = fix16_from_dbl(fa + fb);
        double max = fix16_to_dbl(fix16_max);
        double min = fix16_to_dbl(fix16_min);
        if (delta(fresult, result) > max_delta)
        {
          if (fa + fb > max || fa + fb < min)
          {
            #ifndef FIXMATH_NO_OVERFLOW
            if (result != fix16_overflow)
            {
              printf("\n%d + %d overflow not detected!\n", a, b);
              failures++;
            }
            #endif
            // Legitimate overflow
            continue;
          }
          printf("\n%d + %d = %d\n", a, b, result);
          printf("%f + %f = %d\n", fa, fb, fresult);
          failures++;
        }
      }
    }
    TEST(failures == 0);
  }
  {
    unsigned int i, j;
    int failures = 0;
    COMMENT("Running testcases for subtraction");
    for (i = 0; i < TESTCASES_COUNT; i++)
    {
      for (j = 0; j < TESTCASES_COUNT; j++)
      {
        fix16_t a = testcases[i];
        fix16_t b = testcases[j];
        fix16_t result = fix16_sub(a, b);
        double fa = fix16_to_dbl(a);
        double fb = fix16_to_dbl(b);
        fix16_t fresult = fix16_from_dbl(fa - fb);
        double max = fix16_to_dbl(fix16_max);
        double min = fix16_to_dbl(fix16_min);
        if (delta(fresult, result) > max_delta)
        {
          if (fa - fb > max || fa - fb < min)
          {
            #ifndef FIXMATH_NO_OVERFLOW
            if (result != fix16_overflow)
            {
              printf("\n%d - %d overflow not detected!\n", a, b);
              failures++;
            }
            #endif
            // Legitimate overflow
            continue;
          }
          printf("\n%d - %d = %d\n", a, b, result);
          printf("%f - %f = %d\n", fa, fb, fresult);
          failures++;
        }
      }
    }
    TEST(failures == 0);
  }
  {
    COMMENT("Testing basic square roots");
    TEST(fix16_sqrt(fix16_from_int(16)) == fix16_from_int(4));
    TEST(fix16_sqrt(fix16_from_int(100)) == fix16_from_int(10));
    TEST(fix16_sqrt(fix16_from_int(1)) == fix16_from_int(1));
  }
 #ifndef FIXMATH_NO_ROUNDING
  {
    COMMENT("Testing square root rounding corner cases");
    TEST(fix16_sqrt(214748302) == 3751499);
    TEST(fix16_sqrt(214748303) == 3751499);
    TEST(fix16_sqrt(214748359) == 3751499);
    TEST(fix16_sqrt(214748360) == 3751500);
  }
 #endif
  {
    unsigned int i;
    int failures = 0;
    COMMENT("Running test cases for square root");
    for (i = 0; i < TESTCASES_COUNT; i++)
    {
      fix16_t a = testcases[i];
      if (a < 0) continue;
      fix16_t result = fix16_sqrt(a);
      double fa = fix16_to_dbl(a);
      fix16_t fresult = fix16_from_dbl(sqrt(fa));
      if (delta(fresult, result) > max_delta)
      {
        printf("\nfix16_sqrt(%d) = %d\n", a, result);
        printf("sqrt(%f) = %d\n", fa, fresult);
        failures++;
      }
    }
    TEST(failures == 0);
  }
  if (status != 0)
    fprintf(stdout, "\n\nSome tests FAILED!\n");
  return status;
 }
--- a/unittests/unittests.h
+++ b/unittests/unittests.h
@ -0,0 +1,18 @@
 #include <stdio.h>
 #define COMMENT(x) printf("\n----" x "----\n");
 #define STR(x) #x
 #define STR2(x) STR(x)
 #define TEST(x) \
    if (!(x)) { \
        fflush(stdout); \
        fflush(stderr); \
        fprintf(stderr, "\033[31;1mFAILED:\033[22;39m " __FILE__ ":" STR2(__LINE__) " " #x "\n"); \
        status = 1; \
    } else { \
        fflush(stdout); \
        fflush(stderr); \
        printf("\033[32;1mOK:\033[22;39m " #x "\n"); \
    }