From 20a0808887074b08c4cff82b1ab2cc4bd8ced6e1 Mon Sep 17 00:00:00 2001
From: Markus Hitter <mah@jump-ing.de>
Date: Mon, 21 Nov 2016 11:56:13 +0100
Subject: [PATCH] DDA: don't count individual axis steps.

Using the Bresenham algorithm it's safe to assume that if the axis
with the most steps is done, all other axes are done, too.

This way we save a lot of variable loading in dda_step(). We also
save this very expensive comparison of all axis counters against
zero. Minor drawback: update_current_position() is now even slower.

About performance. The slowest step decreased from 719 to 604
clocks, which is quite an improvement. Average step time increased
for single axis movements by 16 clocks and decreased for multi-
axis movements. At the bottom line this should improve real-world
performance quite a bit, because a printer movement speed isn't
limited by average timings, but by the time needed for the slowest
step.

Along the way, binary size dropped by nice 244 bytes, RAM usage by
also nice 16 bytes.

  ATmega sizes               '168   '328(P)   '644(P)     '1280
  Program:  19564 bytes      137%       64%       31%       16%
     Data:   2175 bytes      213%      107%       54%       27%
   EEPROM:     32 bytes        4%        2%        2%        1%

  short-moves.gcode statistics:
  LED on occurences: 888.
  LED on time minimum: 326 clock cycles.
  LED on time maximum: 595 clock cycles.
  LED on time average: 333.62 clock cycles.

  smooth-curves.gcode statistics:
  LED on occurences: 23648.
  LED on time minimum: 318 clock cycles.
  LED on time maximum: 604 clock cycles.
  LED on time average: 333.311 clock cycles.

  triangle-odd.gcode statistics:
  LED on occurences: 1636.
  LED on time minimum: 318 clock cycles.
  LED on time maximum: 585 clock cycles.
  LED on time average: 335.233 clock cycles.
---
 dda.c | 85 ++++++++++++++++++++++++++++-------------------------------
 dda.h |  9 +++----
 2 files changed, 44 insertions(+), 50 deletions(-)

diff --git a/dda.c b/dda.c
index a9d3646..7824493 100644
--- a/dda.c
+++ b/dda.c
@@ -220,7 +220,7 @@ void dda_create(DDA *dda, const TARGET *target) {
 
   // Apply extrusion multiplier.
   if (target->e_multiplier != 256) {
-  steps[E] *= target->e_multiplier;
+    steps[E] *= target->e_multiplier;
     steps[E] += 128;
     steps[E] /= 256;
   }
@@ -522,12 +522,12 @@ void dda_start(DDA *dda) {
 		// initialise state variable
     move_state.counter[X] = move_state.counter[Y] = move_state.counter[Z] = \
       move_state.counter[E] = -(dda->total_steps >> 1);
-    memcpy(&move_state.steps[X], &dda->delta[X], sizeof(uint32_t) * 4);
     move_state.endstop_stop = 0;
 		#ifdef ACCELERATION_RAMPING
 			move_state.step_no = 0;
 		#endif
 		#ifdef ACCELERATION_TEMPORAL
+      memcpy(&move_state.steps[X], &dda->delta[X], sizeof(uint32_t) * 4);
       move_state.time[X] = move_state.time[Y] = \
         move_state.time[Z] = move_state.time[E] = 0UL;
 		#endif
@@ -565,40 +565,29 @@ void dda_start(DDA *dda) {
 */
 void dda_step(DDA *dda) {
 
-#if ! defined ACCELERATION_TEMPORAL
-  if (move_state.steps[X]) {
+  #if ! defined ACCELERATION_TEMPORAL
     move_state.counter[X] -= dda->delta[X];
     if (move_state.counter[X] < 0) {
 			x_step();
-      move_state.steps[X]--;
       move_state.counter[X] += dda->total_steps;
 		}
-	}
-  if (move_state.steps[Y]) {
     move_state.counter[Y] -= dda->delta[Y];
     if (move_state.counter[Y] < 0) {
 			y_step();
-      move_state.steps[Y]--;
       move_state.counter[Y] += dda->total_steps;
 		}
-	}
-  if (move_state.steps[Z]) {
     move_state.counter[Z] -= dda->delta[Z];
     if (move_state.counter[Z] < 0) {
 			z_step();
-      move_state.steps[Z]--;
       move_state.counter[Z] += dda->total_steps;
 		}
-	}
-  if (move_state.steps[E]) {
     move_state.counter[E] -= dda->delta[E];
     if (move_state.counter[E] < 0) {
 			e_step();
-      move_state.steps[E]--;
       move_state.counter[E] += dda->total_steps;
 		}
-	}
-#endif
+    move_state.step_no++;
+  #endif
 
 	#ifdef ACCELERATION_REPRAP
 		// linear acceleration magic, courtesy of http://www.embedded.com/design/mcus-processors-and-socs/4006438/Generate-stepper-motor-speed-profiles-in-real-time
@@ -628,10 +617,6 @@ void dda_step(DDA *dda) {
 		}
 	#endif
 
-	#ifdef ACCELERATION_RAMPING
-		move_state.step_no++;
-	#endif
-
   #ifdef ACCELERATION_TEMPORAL
     /** How is this ACCELERATION TEMPORAL expected to work?
 
@@ -710,12 +695,14 @@ void dda_step(DDA *dda) {
   //
   // TODO: with ACCELERATION_TEMPORAL this duplicates some code. See where
   //       dda->live is zero'd, about 10 lines above.
-  if ((move_state.steps[X] == 0 && move_state.steps[Y] == 0 &&
-       move_state.steps[Z] == 0 && move_state.steps[E] == 0)
-    #ifdef ACCELERATION_RAMPING
-      || (move_state.endstop_stop && dda->n <= 0)
-    #endif
-      ) {
+  #if ! defined ACCELERATION_TEMPORAL
+    if (move_state.step_no >= dda->total_steps ||
+        (move_state.endstop_stop && dda->n <= 0))
+  #else
+    if (move_state.steps[X] == 0 && move_state.steps[Y] == 0 &&
+        move_state.steps[Z] == 0 && move_state.steps[E] == 0)
+  #endif
+  {
 		dda->live = 0;
     dda->done = 1;
     #ifdef LOOKAHEAD
@@ -958,26 +945,36 @@ void update_current_position() {
     ((STEPS_PER_M_E + 500) / 1000)
   };
 
-	if (queue_empty()) {
+  if (dda->live) {
+    uint32_t axis_steps;
+
+    for (i = X; i < AXIS_COUNT; i++) {
+      #if ! defined ACCELERATION_TEMPORAL
+        axis_steps = muldiv(dda->total_steps - move_state.step_no,
+                            dda->delta[i], dda->total_steps);
+      #else
+        axis_steps = move_state.steps[i];
+      #endif
+      current_position.axis[i] =
+        dda->endpoint.axis[i] - (int32_t)get_direction(dda, i) *
+        // Should be: move_state.steps[i] * 1000000 / steps_per_m_P[i])
+        // but steps[i] can be like 1000000 already, so we'd overflow.
+        // Unfortunately, using muldiv() overwhelms the compiler.
+        // Also keep the parens around this term, else results go wrong.
+        ((axis_steps * 1000) / pgm_read_dword(&steps_per_mm_P[i]));
+    }
+
+    if (dda->endpoint.e_relative) {
+      // We support only one extruder, so axis_steps is already valid.
+      current_position.axis[E] =
+          (axis_steps * 1000) / pgm_read_dword(&steps_per_mm_P[E]);
+    }
+
+		// current_position.F is updated in dda_start()
+	}
+  else {
     for (i = X; i < AXIS_COUNT; i++) {
       current_position.axis[i] = startpoint.axis[i];
     }
 	}
-	else if (dda->live) {
-    for (i = X; i < AXIS_COUNT; i++) {
-      current_position.axis[i] = dda->endpoint.axis[i] -
-          (int32_t)get_direction(dda, i) *
-          // Should be: move_state.steps[i] * 1000000 / steps_per_m_P[i])
-          // but steps[i] can be like 1000000 already, so we'd overflow.
-          // Unfortunately, using muldiv() overwhelms the compiler.
-          // Also keep the parens around this term, else results go wrong.
-          ((move_state.steps[i] * 1000) / pgm_read_dword(&steps_per_mm_P[i]));
-    }
-
-    if (dda->endpoint.e_relative)
-      current_position.axis[E] =
-          (move_state.steps[E] * 1000) / pgm_read_dword(&steps_per_mm_P[E]);
-
-		// current_position.F is updated in dda_start()
-	}
 }
diff --git a/dda.h b/dda.h
index 18f2fe8..0aa853f 100644
--- a/dda.h
+++ b/dda.h
@@ -56,14 +56,11 @@ typedef struct {
 	// bresenham counters
   axes_int32_t      counter; ///< counter for total_steps vs each axis
 
-	// step counters
-  axes_uint32_t     steps;   ///< number of steps on each axis
-
-	#ifdef ACCELERATION_RAMPING
+  #if ! defined ACCELERATION_TEMPORAL
 	/// counts actual steps done
 	uint32_t					step_no;
-	#endif
-	#ifdef ACCELERATION_TEMPORAL
+	#else
+    axes_uint32_t   steps;      ///< number of steps on each axis
   axes_uint32_t     time;       ///< time of the last step on each axis
   uint32_t          last_time;  ///< time of the last step of any axis
 	#endif