Faster implementation of integer square root.

Implementation by Roland Brochard <zuzuf86@gmail.com>. Note: If you wonder how code doing multiplications can be faster than code doing just shifts and increments: I've measured it. One million square roots in 30 seconds with the new code instead of 220 seconds with the old code on a Gen7 20 MHz. That's just 30 microseconds or 600 CPU cycles per root. Code used for the measurement (timed with a stopwatch) in mendel.c: ... #include "dda_maths.h" #include "delay.h" int main (void) { uint32_t i, j; serial_init(); sei(); serial_writestr_P(PSTR("start\n")); for (i = 0; i < 1000000; i++) { j = int_sqrt(i); } serial_writestr_P(PSTR("done\n")); delay_ms(20); cli(); init(); ... --Traumflug
2013-09-10 22:44:37 +02:00 · 2013-09-10 22:44:37 +02:00 · af12c7a68a
parent 13ec2d7521
commit af12c7a68a
1 changed files with 36 additions and 16 deletions
--- a/dda_maths.c
+++ b/dda_maths.c
@ -135,27 +135,47 @@ uint32_t approx_distance_3(uint32_t dx, uint32_t dy, uint32_t dz) {
  \param a find square root of this number
  \return returnvalue <= sqrt(a) < returnvalue + 1, i.e. floor(sqrt(a))

-  see http://www.embedded-systems.com/98/9802fe2.htm
+  This is a binary search but it uses only the minimum required bits for
+  each step.
 */
-// courtesy of http://www.embedded-systems.com/98/9802fe2.htm
 uint16_t int_sqrt(uint32_t a) {
-  uint32_t rem = 0;
-  uint32_t root = 0;
+  uint16_t b = a >> 16;  // upper 16 bits of a
+  uint8_t c = b >> 8;  // upper 8 bits of a
+  uint16_t x = 0;  // root estimate for stages 2 and 3; returned at the end
+  uint8_t z = 0;  // 4-bit root estimate built in stage 1
  uint16_t i;
+  uint8_t j;

-  for (i = 0; i < 16; i++) {
-    root <<= 1;
-    rem = ((rem << 2) + (a >> 30));
-    a <<= 2;
-    root++;
-    if (root <= rem) {
-      rem -= root;
-      root++;
-    }
-    else
-      root--;
+  for (j = 0x8; j; j >>= 1) {  // stage 1: try bits 3..0 of z, highest first
+    uint8_t y2;
+
+    z |= j;  // tentatively set the bit
+    y2 = z * z;  // z <= 15 here, so the square (<= 225) fits in 8 bits
+    if (y2 > c)
+      z ^= j;  // square exceeds c: clear the bit again
  }
-  return (uint16_t) ((root >> 1) & 0xFFFFL);
+  
+  x = z << 4;  // stage-1 result becomes the high nibble of an 8-bit estimate
+  for(i = 0x8; i; i >>= 1) {  // stage 2: try bits 3..0 of x, highest first
+    uint16_t y2;
+
+    x |= i;  // tentatively set the bit
+    y2 = x * x;  // x <= 255 here, so the square fits in 16 bits
+    if (y2 > b)
+      x ^= i;  // square exceeds the upper 16 bits of a: clear the bit again
+  }
+  
+  x <<= 8;  // stage-2 result becomes the high byte of the 16-bit estimate
+  for(i = 0x80; i; i >>= 1) {  // stage 3: try bits 7..0 of x, highest first
+    uint32_t y2;
+
+    x |= i;  // tentatively set the bit
+    y2 = (uint32_t)x * x;  // cast so the product is computed in at least 32 bits
+    if (y2 > a)
+      x ^= i;  // square exceeds a: clear the bit again
+  }
+
+  return x;
 }

 // this is an ultra-crude pseudo-logarithm routine, such that: