From 2ad7517e27efe04ce0cfb034409a6a3b4d322cd7 Mon Sep 17 00:00:00 2001
From: Markus Hitter <mah@jump-ing.de>
Date: Sun, 6 Jul 2014 23:17:06 +0200
Subject: [PATCH] preprocessor_math.h, SQRT(): take a better initial guess.

Now results are apparently accurate across the whole uint32 range.
At least, this test passes with all numbers being exact:

  #include "preprocessor_math.h"
  #include <math.h>
  ... in main() ...
  sersendf_P(PSTR("0:  %lu  %lu\n"), (uint32_t)SQRT(0), (uint32_t)sqrt(0));
  sersendf_P(PSTR("1:  %lu  %lu\n"), (uint32_t)SQRT(1), (uint32_t)sqrt(1));
  sersendf_P(PSTR("2:  %lu  %lu\n"), (uint32_t)SQRT(2), (uint32_t)sqrt(2));
  sersendf_P(PSTR("3:  %lu  %lu\n"), (uint32_t)SQRT(3), (uint32_t)sqrt(3));
  sersendf_P(PSTR("4:  %lu  %lu\n"), (uint32_t)SQRT(4), (uint32_t)sqrt(4));
  sersendf_P(PSTR("5:  %lu  %lu\n"), (uint32_t)SQRT(5), (uint32_t)sqrt(5));
  sersendf_P(PSTR("6:  %lu  %lu\n"), (uint32_t)SQRT(6), (uint32_t)sqrt(6));
  sersendf_P(PSTR("7:  %lu  %lu\n"), (uint32_t)SQRT(7), (uint32_t)sqrt(7));
  sersendf_P(PSTR("8:  %lu  %lu\n"), (uint32_t)SQRT(8), (uint32_t)sqrt(8));
  sersendf_P(PSTR("9:  %lu  %lu\n"), (uint32_t)SQRT(9), (uint32_t)sqrt(9));
  sersendf_P(PSTR("10:  %lu  %lu\n"), (uint32_t)SQRT(10), (uint32_t)sqrt(10));
  sersendf_P(PSTR("20:  %lu  %lu\n"), (uint32_t)SQRT(20), (uint32_t)sqrt(20));
  sersendf_P(PSTR("30:  %lu  %lu\n"), (uint32_t)SQRT(30), (uint32_t)sqrt(30));
  sersendf_P(PSTR("40:  %lu  %lu\n"), (uint32_t)SQRT(40), (uint32_t)sqrt(40));
  sersendf_P(PSTR("50:  %lu  %lu\n"), (uint32_t)SQRT(50), (uint32_t)sqrt(50));
  sersendf_P(PSTR("60:  %lu  %lu\n"), (uint32_t)SQRT(60), (uint32_t)sqrt(60));
  sersendf_P(PSTR("70:  %lu  %lu\n"), (uint32_t)SQRT(70), (uint32_t)sqrt(70));
  sersendf_P(PSTR("80:  %lu  %lu\n"), (uint32_t)SQRT(80), (uint32_t)sqrt(80));
  sersendf_P(PSTR("90:  %lu  %lu\n"), (uint32_t)SQRT(90), (uint32_t)sqrt(90));
  sersendf_P(PSTR("100:  %lu  %lu\n"), (uint32_t)SQRT(100), (uint32_t)sqrt(100));
  sersendf_P(PSTR("200:  %lu  %lu\n"), (uint32_t)SQRT(200), (uint32_t)sqrt(200));
  sersendf_P(PSTR("300:  %lu  %lu\n"), (uint32_t)SQRT(300), (uint32_t)sqrt(300));
  sersendf_P(PSTR("400:  %lu  %lu\n"), (uint32_t)SQRT(400), (uint32_t)sqrt(400));
  sersendf_P(PSTR("500:  %lu  %lu\n"), (uint32_t)SQRT(500), (uint32_t)sqrt(500));
  sersendf_P(PSTR("600:  %lu  %lu\n"), (uint32_t)SQRT(600), (uint32_t)sqrt(600));
  sersendf_P(PSTR("700:  %lu  %lu\n"), (uint32_t)SQRT(700), (uint32_t)sqrt(700));
  sersendf_P(PSTR("800:  %lu  %lu\n"), (uint32_t)SQRT(800), (uint32_t)sqrt(800));
  sersendf_P(PSTR("900:  %lu  %lu\n"), (uint32_t)SQRT(900), (uint32_t)sqrt(900));
  sersendf_P(PSTR("1000:  %lu  %lu\n"), (uint32_t)SQRT(1000), (uint32_t)sqrt(1000));
  sersendf_P(PSTR("2000:  %lu  %lu\n"), (uint32_t)SQRT(2000), (uint32_t)sqrt(2000));
  sersendf_P(PSTR("3000:  %lu  %lu\n"), (uint32_t)SQRT(3000), (uint32_t)sqrt(3000));
  sersendf_P(PSTR("4000:  %lu  %lu\n"), (uint32_t)SQRT(4000), (uint32_t)sqrt(4000));
  sersendf_P(PSTR("5000:  %lu  %lu\n"), (uint32_t)SQRT(5000), (uint32_t)sqrt(5000));
  sersendf_P(PSTR("6000:  %lu  %lu\n"), (uint32_t)SQRT(6000), (uint32_t)sqrt(6000));
  sersendf_P(PSTR("7000:  %lu  %lu\n"), (uint32_t)SQRT(7000), (uint32_t)sqrt(7000));
  sersendf_P(PSTR("8000:  %lu  %lu\n"), (uint32_t)SQRT(8000), (uint32_t)sqrt(8000));
  sersendf_P(PSTR("9000:  %lu  %lu\n"), (uint32_t)SQRT(9000), (uint32_t)sqrt(9000));
  sersendf_P(PSTR("10000:  %lu  %lu\n"), (uint32_t)SQRT(10000), (uint32_t)sqrt(10000));
  sersendf_P(PSTR("20000:  %lu  %lu\n"), (uint32_t)SQRT(20000), (uint32_t)sqrt(20000));
  sersendf_P(PSTR("30000:  %lu  %lu\n"), (uint32_t)SQRT(30000), (uint32_t)sqrt(30000));
  sersendf_P(PSTR("40000:  %lu  %lu\n"), (uint32_t)SQRT(40000), (uint32_t)sqrt(40000));
  sersendf_P(PSTR("50000:  %lu  %lu\n"), (uint32_t)SQRT(50000), (uint32_t)sqrt(50000));
  sersendf_P(PSTR("60000:  %lu  %lu\n"), (uint32_t)SQRT(60000), (uint32_t)sqrt(60000));
  sersendf_P(PSTR("70000:  %lu  %lu\n"), (uint32_t)SQRT(70000), (uint32_t)sqrt(70000));
  sersendf_P(PSTR("80000:  %lu  %lu\n"), (uint32_t)SQRT(80000), (uint32_t)sqrt(80000));
  sersendf_P(PSTR("90000:  %lu  %lu\n"), (uint32_t)SQRT(90000), (uint32_t)sqrt(90000));
  sersendf_P(PSTR("100000:  %lu  %lu\n"), (uint32_t)SQRT(100000), (uint32_t)sqrt(100000));
  sersendf_P(PSTR("200000:  %lu  %lu\n"), (uint32_t)SQRT(200000), (uint32_t)sqrt(200000));
  sersendf_P(PSTR("300000:  %lu  %lu\n"), (uint32_t)SQRT(300000), (uint32_t)sqrt(300000));
  sersendf_P(PSTR("400000:  %lu  %lu\n"), (uint32_t)SQRT(400000), (uint32_t)sqrt(400000));
  sersendf_P(PSTR("500000:  %lu  %lu\n"), (uint32_t)SQRT(500000), (uint32_t)sqrt(500000));
  sersendf_P(PSTR("600000:  %lu  %lu\n"), (uint32_t)SQRT(600000), (uint32_t)sqrt(600000));
  sersendf_P(PSTR("700000:  %lu  %lu\n"), (uint32_t)SQRT(700000), (uint32_t)sqrt(700000));
  sersendf_P(PSTR("800000:  %lu  %lu\n"), (uint32_t)SQRT(800000), (uint32_t)sqrt(800000));
  sersendf_P(PSTR("900000:  %lu  %lu\n"), (uint32_t)SQRT(900000), (uint32_t)sqrt(900000));
  sersendf_P(PSTR("1000000:  %lu  %lu\n"), (uint32_t)SQRT(1000000), (uint32_t)sqrt(1000000));
  sersendf_P(PSTR("2000000:  %lu  %lu\n"), (uint32_t)SQRT(2000000), (uint32_t)sqrt(2000000));
  sersendf_P(PSTR("3000000:  %lu  %lu\n"), (uint32_t)SQRT(3000000), (uint32_t)sqrt(3000000));
  sersendf_P(PSTR("4000000:  %lu  %lu\n"), (uint32_t)SQRT(4000000), (uint32_t)sqrt(4000000));
  sersendf_P(PSTR("5000000:  %lu  %lu\n"), (uint32_t)SQRT(5000000), (uint32_t)sqrt(5000000));
  sersendf_P(PSTR("6000000:  %lu  %lu\n"), (uint32_t)SQRT(6000000), (uint32_t)sqrt(6000000));
  sersendf_P(PSTR("7000000:  %lu  %lu\n"), (uint32_t)SQRT(7000000), (uint32_t)sqrt(7000000));
  sersendf_P(PSTR("8000000:  %lu  %lu\n"), (uint32_t)SQRT(8000000), (uint32_t)sqrt(8000000));
  sersendf_P(PSTR("9000000:  %lu  %lu\n"), (uint32_t)SQRT(9000000), (uint32_t)sqrt(9000000));
  sersendf_P(PSTR("10000000:  %lu  %lu\n"), (uint32_t)SQRT(10000000), (uint32_t)sqrt(10000000));
  sersendf_P(PSTR("20000000:  %lu  %lu\n"), (uint32_t)SQRT(20000000), (uint32_t)sqrt(20000000));
  sersendf_P(PSTR("30000000:  %lu  %lu\n"), (uint32_t)SQRT(30000000), (uint32_t)sqrt(30000000));
  sersendf_P(PSTR("40000000:  %lu  %lu\n"), (uint32_t)SQRT(40000000), (uint32_t)sqrt(40000000));
  sersendf_P(PSTR("50000000:  %lu  %lu\n"), (uint32_t)SQRT(50000000), (uint32_t)sqrt(50000000));
  sersendf_P(PSTR("60000000:  %lu  %lu\n"), (uint32_t)SQRT(60000000), (uint32_t)sqrt(60000000));
  sersendf_P(PSTR("70000000:  %lu  %lu\n"), (uint32_t)SQRT(70000000), (uint32_t)sqrt(70000000));
  sersendf_P(PSTR("80000000:  %lu  %lu\n"), (uint32_t)SQRT(80000000), (uint32_t)sqrt(80000000));
  sersendf_P(PSTR("90000000:  %lu  %lu\n"), (uint32_t)SQRT(90000000), (uint32_t)sqrt(90000000));
  sersendf_P(PSTR("100000000:  %lu  %lu\n"), (uint32_t)SQRT(100000000), (uint32_t)sqrt(100000000));
  sersendf_P(PSTR("200000000:  %lu  %lu\n"), (uint32_t)SQRT(200000000), (uint32_t)sqrt(200000000));
  sersendf_P(PSTR("300000000:  %lu  %lu\n"), (uint32_t)SQRT(300000000), (uint32_t)sqrt(300000000));
  sersendf_P(PSTR("400000000:  %lu  %lu\n"), (uint32_t)SQRT(400000000), (uint32_t)sqrt(400000000));
  sersendf_P(PSTR("500000000:  %lu  %lu\n"), (uint32_t)SQRT(500000000), (uint32_t)sqrt(500000000));
  sersendf_P(PSTR("600000000:  %lu  %lu\n"), (uint32_t)SQRT(600000000), (uint32_t)sqrt(600000000));
  sersendf_P(PSTR("700000000:  %lu  %lu\n"), (uint32_t)SQRT(700000000), (uint32_t)sqrt(700000000));
  sersendf_P(PSTR("800000000:  %lu  %lu\n"), (uint32_t)SQRT(800000000), (uint32_t)sqrt(800000000));
  sersendf_P(PSTR("900000000:  %lu  %lu\n"), (uint32_t)SQRT(900000000), (uint32_t)sqrt(900000000));
  sersendf_P(PSTR("1000000000:  %lu  %lu\n"), (uint32_t)SQRT(1000000000), (uint32_t)sqrt(1000000000));
  sersendf_P(PSTR("2000000000:  %lu  %lu\n"), (uint32_t)SQRT(2000000000), (uint32_t)sqrt(2000000000));
  sersendf_P(PSTR("3000000000:  %lu  %lu\n"), (uint32_t)SQRT(3000000000), (uint32_t)sqrt(3000000000));
  sersendf_P(PSTR("4000000000:  %lu  %lu\n"), (uint32_t)SQRT(4000000000), (uint32_t)sqrt(4000000000));
---
 preprocessor_math.h | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/preprocessor_math.h b/preprocessor_math.h
index d4fe60a..f4c8dbb 100644
--- a/preprocessor_math.h
+++ b/preprocessor_math.h
@@ -18,17 +18,16 @@
   equals
   (uint32_t)(sqrt(i) + .5)
 
-  These two provide identical results up to 1'861'860 (tested at runtime) and
-  up to 10'000'000 at compile time. At 90'000'000, deviation is about 5%,
-  increasing further at even higher numbers. This "+ .5" is for rounding and
-  not crucial. Casting to other sizes is also possible.
+  These two provide identical results for all tested numbers across the
+  uint32 range. This "+ .5" is for rounding and not crucial. Casting to
+  other sizes is also possible.
 
-  Can be used for calculations at runtime, too, where it costs 944(!) bytes
-  binary size and takes roughly 10'000(!) clock cycles.
+  Can principally be used for calculations at runtime, too, but its compiled
+  size is prohibitively large (more than 20kB per instance).
 
   Initial version found on pl.comp.lang.c, posted by Jean-Louis PATANE.
 */
-#define SQR00(x) (((double)(x)) / 2)
+#define SQR00(x) (((x) > 65535) ? (double)65535 : (double)(x) / 2)
 #define SQR01(x) ((SQR00(x) + ((x) / SQR00(x))) / 2)
 #define SQR02(x) ((SQR01(x) + ((x) / SQR01(x))) / 2)
 #define SQR03(x) ((SQR02(x) + ((x) / SQR02(x))) / 2)
@@ -41,11 +40,10 @@
 #define SQR10(x) ((SQR09(x) + ((x) / SQR09(x))) / 2)
 #define SQR11(x) ((SQR10(x) + ((x) / SQR10(x))) / 2)
 #define SQR12(x) ((SQR11(x) + ((x) / SQR11(x))) / 2)
-// You can add more of these lines here, each additional line increases
-// accurate range by about factor 2 and costs additional 40 bytes binary size
-// in the non-constant case. But beware, the length of the preprocessed term
-// explodes, leading to several seconds compile time above about SQR13.
-#define SQRT(x) ((SQR12(x) + ((x) / SQR12(x))) / 2)
+// We use 9 iterations, note how SQR10() and up get ignored. You can add more
+// iterations here, but beware, the length of the preprocessed term
+// explodes, leading to several seconds compile time above about SQR10().
+#define SQRT(x) ((SQR09(x) + ((x) / SQR09(x))) / 2)
 
 
 #endif /* _PREPROCESSOR_MATH_H */