From af12c7a68a768b86e267efb25ed67e2ed7b8e0e7 Mon Sep 17 00:00:00 2001 From: Roland Brochard Date: Tue, 10 Sep 2013 22:44:37 +0200 Subject: [PATCH] Faster implementation of integer square root. Implementation by Roland Brochard . Note: If you wonder how code doing multiplications can be faster than code doing just shifts and increments: I've measured it. One million square roots in 30 seconds with the new code instead of 220 seconds with the old code on a Gen7 20 MHz. That's just 30 microseconds or 600 CPU cycles per root. Code used for the measurement (by a stopwatch) in mendel.c: ... *include "dda_maths.h" *include "delay.h" int main (void) { uint32_t i, j; serial_init(); sei(); serial_writestr_P(PSTR("start\n")); for (i = 0; i < 1000000; i++) { j = int_sqrt(i); } serial_writestr_P(PSTR("done\n")); delay_ms(20); cli(); init(); ... --Traumflug --- dda_maths.c | 52 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/dda_maths.c b/dda_maths.c index fe613a5..4952b1a 100644 --- a/dda_maths.c +++ b/dda_maths.c @@ -135,27 +135,47 @@ uint32_t approx_distance_3(uint32_t dx, uint32_t dy, uint32_t dz) { \param a find square root of this number \return sqrt(a - 1) < returnvalue <= sqrt(a) - see http://www.embedded-systems.com/98/9802fe2.htm + This is a binary search but it uses only the minimum required bits for + each step. */ -// courtesy of http://www.embedded-systems.com/98/9802fe2.htm uint16_t int_sqrt(uint32_t a) { - uint32_t rem = 0; - uint32_t root = 0; + uint16_t b = a >> 16; + uint8_t c = b >> 8; + uint16_t x = 0; + uint8_t z = 0; uint16_t i; + uint8_t j; - for (i = 0; i < 16; i++) { - root <<= 1; - rem = ((rem << 2) + (a >> 30)); - a <<= 2; - root++; - if (root <= rem) { - rem -= root; - root++; - } - else - root--; + for (j = 0x8; j; j >>= 1) { + uint8_t y2; + + z |= j; + y2 = z * z; + if (y2 > c) + z ^= j; } - return (uint16_t) ((root >> 1) & 0xFFFFL); + + x = z << 4; + for(i = 0x8; i; i >>= 1) { + uint16_t y2; + + x |= i; + y2 = x * x; + if (y2 > b) + x ^= i; + } + + x <<= 8; + for(i = 0x80; i; i >>= 1) { + uint32_t y2; + + x |= i; + y2 = (uint32_t)x * x; + if (y2 > a) + x ^= i; + } + + return x; } // this is an ultra-crude pseudo-logarithm routine, such that: