diff --git a/mendel/clock.h b/mendel/clock.h
index 67946d1..b8eabf0 100644
--- a/mendel/clock.h
+++ b/mendel/clock.h
@@ -3,7 +3,7 @@
 
 #include	<stdint.h>
 
-void			clock_setup(void);
+void			clock_setup(void) __attribute__ ((cold));
 
 #ifdef	GLOBAL_CLOCK
 uint32_t	clock_read(void);
diff --git a/mendel/dda.c b/mendel/dda.c
index feaa636..a5056fe 100644
--- a/mendel/dda.c
+++ b/mendel/dda.c
@@ -106,6 +106,19 @@ uint32_t delta32(uint32_t v1, uint32_t v2) {
 	return v2 - v1;
 }
 
+// crude integer log2: returns the index (0..31) of the most significant set bit
+// of v, so 2 ^ msbloc(v) <= v for any v >= 1; returns 0 when v is 0 or 1
+const uint8_t	msbloc (uint32_t v) {
+	uint8_t i;
+	uint32_t c;
+	for (i = 31, c = 0x80000000; i; i--) {
+		if (v & c)
+			return i;
+		c >>= 1;	// move the probe mask down; shifting v right could never set bit 31 again
+	}
+	return 0;
+}
+
 /*
 	CREATE a dda given current_position and a target, save to passed location so we can write directly into the queue
 */
@@ -114,7 +127,7 @@ void dda_create(DDA *dda, TARGET *target) {
 	uint32_t	distance;
 
 	// initialise DDA to a known state
-	dda->move_duration = 0;
+// 	dda->move_duration = 0;
 	dda->live = 0;
 	dda->total_steps = 0;
 
@@ -128,13 +141,13 @@ void dda_create(DDA *dda, TARGET *target) {
 	dda->y_delta = abs32(target->Y - startpoint.Y);
 	dda->z_delta = abs32(target->Z - startpoint.Z);
 	dda->e_delta = abs32(target->E - startpoint.E);
-	dda->f_delta = delta32(target->F, startpoint.F);
+// 	dda->f_delta = delta32(target->F, startpoint.F);
 
 	dda->x_direction = (target->X >= startpoint.X)?1:0;
 	dda->y_direction = (target->Y >= startpoint.Y)?1:0;
 	dda->z_direction = (target->Z >= startpoint.Z)?1:0;
 	dda->e_direction = (target->E >= startpoint.E)?1:0;
-	dda->f_direction = (target->F >= startpoint.F)?1:0;
+// 	dda->f_direction = (target->F >= startpoint.F)?1:0;
 
 	if (DEBUG) {
 		if (dda->x_direction == 0)
@@ -149,9 +162,10 @@ void dda_create(DDA *dda, TARGET *target) {
 		if (dda->e_direction == 0)
 			serial_writechar('-');
 		serwrite_uint32(dda->e_delta); serial_writechar(',');
-		if (dda->f_direction == 0)
-			serial_writechar('-');
-		serwrite_uint32(dda->f_delta); serial_writestr_P(PSTR("] ["));
+// 		if (dda->f_direction == 0)
+// 			serial_writechar('-');
+// 		serwrite_uint32(dda->f_delta);
+		serial_writestr_P(PSTR("] ["));
 	}
 
 	if (dda->x_delta > dda->total_steps)
@@ -202,10 +216,55 @@ void dda_create(DDA *dda, TARGET *target) {
 		// break this calculation up a bit and lose some precision because 300,000um * 60000 is too big for a uint32
 		// calculate this with a uint64 if you need the precision, but it'll take longer so routines with lots of short moves may suffer
 		// 2^32/6000 is about 715mm which should be plenty
-		dda->move_duration = ((distance * 6000) / dda->total_steps) * 10;
 
-		if (DEBUG)
-			serwrite_uint32(dda->move_duration);
+		// changed * 10 to * (F_CPU / 100000) so we can work in cpu_ticks rather than microseconds.
+		// timer.c setTimer() routine altered for same reason
+		uint32_t move_duration = ((distance * 6000) / dda->total_steps) * (F_CPU / 100000);
+
+		// c is initial step time in IOclk ticks
+		dda->c = move_duration / startpoint.F;
+
+		if (startpoint.F != target->F) {
+			// now some linear acceleration stuff, courtesy of http://www.embedded.com/columns/technicalinsights/56800129?printable=true
+			uint32_t ssq = startpoint.F * startpoint.F;
+			uint32_t esq = target->F * target->F;
+			uint32_t dsq = esq - ssq;
+
+			dda->end_c = move_duration / target->F;
+			// the raw equation WILL overflow at high step rates, but 64 bit math routines take way too much space
+			// at 65536 mm/min (1092mm/s), ssq/esq overflows, and dsq is also close to overflowing if esq/ssq is small
+			// but if ssq-esq is small, ssq/dsq is only a few bits
+			// we'll have to do it a few different ways depending on the msb location in each
+			if ((msbloc(dda->total_steps) + msbloc(ssq)) < 28) {
+				// we have room to do all the multiplies first
+				dda->n = ((dda->total_steps * ssq * 4) / dsq) + 1;
+			}
+// 			else
+// 			if ((msbloc(dda->total_steps) + msbloc(ssq)) < 30) {
+// 				// we have room to do the main multiply first
+// 				dda->n = (((dda->total_steps * ssq) / dsq) << 2) | 1;
+// 			}
+			else if (msbloc(dda->total_steps) > msbloc(ssq)) {
+				// total steps has more precision
+				if (msbloc(dda->total_steps) < 28)
+					dda->n = (((dda->total_steps << 2) / dsq) * ssq) + 1;
+				else
+					dda->n = (((dda->total_steps / dsq) * ssq) << 2) | 1;
+			}
+			else {
+				// otherwise
+				if (msbloc(ssq) < 28)
+					dda->n = (((ssq << 2) / dsq) * dda->total_steps) + 1;
+				else
+					dda->n = (((ssq / dsq) * dda->total_steps) << 2) | 1;
+			}
+// 			if (DEBUG)
+// 				serwrite_uint32(dda->move_duration);
+
+			dda->accel = 1;
+		}
+		else
+			dda->accel = 0;
 	}
 
 	if (DEBUG)
@@ -241,7 +300,7 @@ void dda_start(DDA *dda) {
 	dda->live = 1;
 
 	// set timeout for first step
-	setTimer(dda->move_duration / current_position.F);
+	setTimer(dda->c);
 }
 
 /*
@@ -361,27 +420,43 @@ void dda_step(DDA *dda) {
 	sei();
 	#endif
 
-	if (step_option & F_CAN_STEP) {
-		dda->f_counter -= dda->f_delta;
-		// since we don't allow total_steps to be defined by F, we may need to step multiple times if f_delta is greater than total_steps
-		// loops in interrupt context are a bad idea, but this is the best way to do this that I've come up with so far
-		while (dda->f_counter < 0) {
+// 	if (step_option & F_CAN_STEP) {
+// 		dda->f_counter -= dda->f_delta;
+// 		// since we don't allow total_steps to be defined by F, we may need to step multiple times if f_delta is greater than total_steps
+// 		// loops in interrupt context are a bad idea, but this is the best way to do this that I've come up with so far
+// 		while (dda->f_counter < 0) {
+//
+// 			dda->f_counter += dda->total_steps;
+//
+// 			if (dda->f_direction) {
+// 				current_position.F += 1;
+// 				if (current_position.F > dda->endpoint.F)
+// 					current_position.F = dda->endpoint.F;
+// 			}
+// 			else {
+// 				current_position.F -= 1;
+// 				if (current_position.F < dda->endpoint.F)
+// 					current_position.F = dda->endpoint.F;
+// 			}
+//
+// 			step_option |= F_REAL_STEP;
+// 		}
+// 	}
 
-			dda->f_counter += dda->total_steps;
-
-			if (dda->f_direction) {
-				current_position.F += 1;
-				if (current_position.F > dda->endpoint.F)
-					current_position.F = dda->endpoint.F;
-			}
-			else {
-				current_position.F -= 1;
-				if (current_position.F < dda->endpoint.F)
-					current_position.F = dda->endpoint.F;
-			}
-
-			step_option |= F_REAL_STEP;
+	if (dda->accel) {
+		if (
+				((dda->n > 0) && (dda->c > dda->end_c)) ||
+				((dda->n < 0) && (dda->c < dda->end_c))
+			 ) {
+			dda->c = dda->c - ((dda->c * 2) / dda->n);
+			dda->n += 4;
+			setTimer(dda->c);
 		}
+		else if (dda->c != dda->end_c) {
+			dda->c = dda->end_c;
+			setTimer(dda->c);
+		}
+		// else we are already at target speed
 	}
 
 	if (step_option & REAL_MOVE)
@@ -393,13 +468,15 @@ void dda_step(DDA *dda) {
 	// we simply don't have the memory to precalculate this for each step,
 	// can't use a simplified process because the denominator changes rather than the numerator so the curve is non-linear
 	// and don't have a process framework to force it to be done outside interrupt context within a usable period of time
-	if (step_option & F_REAL_STEP)
-		setTimer(dda->move_duration / current_position.F);
+// 	if (step_option & F_REAL_STEP)
+// 		setTimer(dda->move_duration / current_position.F);
 
 	// if we could do anything at all, we're still running
 	// otherwise, must have finished
-	else if (step_option == 0)
+	else if (step_option == 0) {
 		dda->live = 0;
+		current_position.F = dda->endpoint.F;
+	}
 
 	// turn off step outputs, hopefully they've been on long enough by now to register with the drivers
 	// if not, too bad. or insert a (very!) small delay here, or fire up a spare timer or something
diff --git a/mendel/dda.h b/mendel/dda.h
index 8a3679a..0b74cf1 100644
--- a/mendel/dda.h
+++ b/mendel/dda.h
@@ -28,8 +28,10 @@ typedef struct {
 	uint8_t						z_direction		:1;
 	uint8_t						e_direction		:1;
 	uint8_t						f_direction		:1;
+
 	uint8_t						nullmove			:1;
 	uint8_t						live					:1;
+	uint8_t						accel					:1;
 
 	uint32_t					x_delta;
 	uint32_t					y_delta;
@@ -44,8 +46,12 @@ typedef struct {
 	int32_t						f_counter;
 
 	uint32_t					total_steps;
+// 	uint32_t					move_duration;
 
-	uint32_t					move_duration;
+	// for linear acceleration
+	uint32_t					c;
+	uint32_t					end_c;
+	int32_t						n;
 } DDA;
 
 /*
@@ -65,11 +71,12 @@ extern TARGET current_position;
 	methods
 */
 
-uint32_t approx_distance( uint32_t dx, uint32_t dy );
-uint32_t approx_distance_3( uint32_t dx, uint32_t dy, uint32_t dz );
+uint32_t approx_distance( uint32_t dx, uint32_t dy )								__attribute__ ((hot));
+uint32_t approx_distance_3( uint32_t dx, uint32_t dy, uint32_t dz )	__attribute__ ((hot));
+const uint8_t	msbloc (uint32_t v)																		__attribute__ ((const));
 
 void dda_create(DDA *dda, TARGET *target);
-void dda_start(DDA *dda);
-void dda_step(DDA *dda);
+void dda_start(DDA *dda)																						__attribute__ ((hot));
+void dda_step(DDA *dda)																							__attribute__ ((hot));
 
 #endif	/* _DDA_H */
diff --git a/mendel/dda_queue.h b/mendel/dda_queue.h
index c3176b0..2189a27 100644
--- a/mendel/dda_queue.h
+++ b/mendel/dda_queue.h
@@ -24,7 +24,7 @@ uint8_t queue_empty(void);
 void enqueue(TARGET *t);
 
 // called from step timer when current move is complete
-void next_move(void);
+void next_move(void) __attribute__ ((hot));
 
 // print queue status
 void print_queue(void);
diff --git a/mendel/gcode.c b/mendel/gcode.c
index 4695ba3..b31a33b 100644
--- a/mendel/gcode.c
+++ b/mendel/gcode.c
@@ -427,7 +427,10 @@ void process_gcode_command(GCODE_COMMAND *gcmd) {
 
 			//	G92 - set home
 			case 92:
-				startpoint.X = startpoint.Y = startpoint.Z = startpoint.E = 0;
+				startpoint.X = startpoint.Y = startpoint.Z = startpoint.E =
+				current_position.X = current_position.Y = current_position.Z = current_position.E = 0;
+				startpoint.F =
+				current_position.F = FEEDRATE_SLOW_Z;
 				break;
 
 			// unknown gcode: spit an error
diff --git a/mendel/timer.c b/mendel/timer.c
index e85985b..0952096 100644
--- a/mendel/timer.c
+++ b/mendel/timer.c
@@ -126,7 +126,7 @@ void setTimer(uint32_t delay)
 	// Actual ticks are 0.0625 us, so multiply delay by 16
 
 	// convert to ticks
-	delay = delay US;
+// 	delay = delay US;
 
 	setTimerCeiling(getTimerCeiling(delay));
 	setTimerResolution(getTimerResolution(delay));
diff --git a/mendel/timer.h b/mendel/timer.h
index a85ee1d..67773a8 100644
--- a/mendel/timer.h
+++ b/mendel/timer.h
@@ -11,7 +11,7 @@
 // #define	DEFAULT_TICK	(100 US)
 #define	WAITING_DELAY	(10 MS)
 
-void setupTimerInterrupt(void);
+void setupTimerInterrupt(void) __attribute__ ((cold));
 
 uint8_t getTimerResolution(const uint32_t delay);
 void setTimerResolution(uint8_t r);
diff --git a/mendel/watchdog.c b/mendel/watchdog.c
index e0ce7e5..a4332e9 100644
--- a/mendel/watchdog.c
+++ b/mendel/watchdog.c
@@ -9,9 +9,7 @@
 volatile uint8_t	wd_flag = 0;
 
 // uint8_t mcusr_mirror __attribute__ ((section (".noinit")));
-// void get_mcusr(void) \
-// 	__attribute__((naked)) \
-// 	__attribute__((section(".init3")));
+// void get_mcusr(void) __attribute__((naked)) __attribute__((section(".init3")));
 // void get_mcusr(void) {
 // 	mcusr_mirror = MCUSR;
 // 	MCUSR = 0;
diff --git a/mendel/watchdog.h b/mendel/watchdog.h
index afd0d83..6a7ed4d 100644
--- a/mendel/watchdog.h
+++ b/mendel/watchdog.h
@@ -2,7 +2,7 @@
 #define	_WATCHDOG_H
 
 // initialize
-void wd_init(void);
+void wd_init(void) __attribute__ ((cold));
 
 // reset timeout- must be called periodically or we reboot
 void wd_reset(void);