First commit.

2020-07-17 18:55:06 +02:00
commit b47a0ab935
247 changed files with 30192 additions and 0 deletions
@@ -0,0 +1,467 @@
+#ifndef __INC_CLOCKLESS_TRINKET_H
+#define __INC_CLOCKLESS_TRINKET_H
+
+#include "../../controller.h"
+#include "../../lib8tion.h"
+#include <avr/interrupt.h> // for cli/sei definitions
+
+FASTLED_NAMESPACE_BEGIN
+
+#if defined(FASTLED_AVR)
+
+// Scaling macro choice - defaults to 1 unless the build already defines TRINKET_SCALE
+#ifndef TRINKET_SCALE
+#define TRINKET_SCALE 1
+// Whether or not to use dithering (1 selects the dithering variants of the PRESCALE*/PSBIDATA4 macros). NOTE(review): DITHER is only defined here when TRINKET_SCALE was not predefined - confirm that coupling is intended
+#define DITHER 1
+#endif
+
+#if (F_CPU==8000000) // 8MHz builds: hook for an extra one-cycle pad at the start of HI1
+#define FASTLED_SLOW_CLOCK_ADJUST // currently expands to nothing; the pad instruction is left commented out: asm __volatile__ ("mov r0,r0\n\t");
+#else
+#define FASTLED_SLOW_CLOCK_ADJUST // expands to nothing at every other clock speed
+#endif
+
+#define US_PER_TICK (64 / (F_CPU/1000000)) // 64 divided by the CPU clock in MHz, i.e. microseconds per 64 CPU cycles (presumably one timer tick at a /64 prescaler - confirm)
+
+// Cycle-delay helpers: variations on the functions in delay.h that take a caller-supplied loop variable, so the register used for counting stays under the caller's control across calls instead of being chosen by the optimizer/compiler. _dc<CYCLES>(loopvar) is meant to burn CYCLES clock cycles.
+template<int CYCLES> inline void _dc(register uint8_t & loopvar);
+
+template<int _LOOP, int PAD> __attribute__((always_inline)) inline void _dc_AVR(register uint8_t & loopvar) { // Counted delay: burns PAD cycles via _dc<PAD>, then counts loopvar down from _LOOP to zero
+	_dc<PAD>(loopvar); // pad cycles first (the CYCLES%6 remainder when called from _dc<CYCLES>)
+	// The convolution in here is to ensure that the state of the carry flag coming into the delay loop is preserved: the countdown loop exists twice, once per incoming carry state, and BRCS picks the copy to run
+	asm __volatile__ (  "BRCS L_PC%=\n\t" // carry set on entry -> run the second copy
+						"        LDI %[loopvar], %[_LOOP]\n\tL_%=: DEC %[loopvar]\n\t BRNE L_%=\n\tBREQ L_DONE%=\n\t" // carry-clear copy; leaves through BREQ once loopvar reaches zero
+						"L_PC%=: LDI %[loopvar], %[_LOOP]\n\tLL_%=: DEC %[loopvar]\n\t BRNE LL_%=\n\tBSET 0\n\t" // carry-set copy; BSET 0 sets SREG bit 0 (the carry flag) again on the way out
+						"L_DONE%=:\n\t"
+						:
+							[loopvar] "+a" (loopvar) : [_LOOP] "M" (_LOOP) : ); // "+a": loopvar is read and written in an upper register (LDI needs one); "M": _LOOP must be an 8-bit constant
+}
+
+template<int CYCLES> __attribute__((always_inline)) inline void _dc(register uint8_t & loopvar) { // Generic case (any CYCLES without an explicit specialization): delegate to the counted loop
+	_dc_AVR<CYCLES/6,CYCLES%6>(loopvar); // CYCLES/6 loop iterations plus CYCLES%6 pad cycles. NOTE(review): this assumes 6 cycles per loop iteration - confirm against the DEC/BRNE loop in _dc_AVR
+}
+template<> __attribute__((always_inline)) inline void _dc<-6>(register uint8_t & ) {} // negative and zero cycle counts are no-ops, so an over-adjusted delay simply costs nothing
+template<> __attribute__((always_inline)) inline void _dc<-5>(register uint8_t & ) {}
+template<> __attribute__((always_inline)) inline void _dc<-4>(register uint8_t & ) {}
+template<> __attribute__((always_inline)) inline void _dc<-3>(register uint8_t & ) {}
+template<> __attribute__((always_inline)) inline void _dc<-2>(register uint8_t & ) {}
+template<> __attribute__((always_inline)) inline void _dc<-1>(register uint8_t & ) {}
+template<> __attribute__((always_inline)) inline void _dc< 0>(register uint8_t & ) {}
+template<> __attribute__((always_inline)) inline void _dc< 1>(register uint8_t & ) {asm __volatile__("mov r0,r0":::);} // 1 cycle: register-to-itself move used as a nop
+template<> __attribute__((always_inline)) inline void _dc< 2>(register uint8_t & ) {asm __volatile__("rjmp .+0":::);} // 2 cycles in one opcode: relative jump to the next instruction
+template<> __attribute__((always_inline)) inline void _dc< 3>(register uint8_t & loopvar) { _dc<2>(loopvar); _dc<1>(loopvar); } // 3..20 are composed from the smaller delays; loopvar is only passed along, never used
+template<> __attribute__((always_inline)) inline void _dc< 4>(register uint8_t & loopvar) { _dc<2>(loopvar); _dc<2>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc< 5>(register uint8_t & loopvar) { _dc<2>(loopvar); _dc<3>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc< 6>(register uint8_t & loopvar) { _dc<2>(loopvar); _dc<2>(loopvar); _dc<2>(loopvar);}
+template<> __attribute__((always_inline)) inline void _dc< 7>(register uint8_t & loopvar) { _dc<4>(loopvar); _dc<3>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc< 8>(register uint8_t & loopvar) { _dc<4>(loopvar); _dc<4>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc< 9>(register uint8_t & loopvar) { _dc<5>(loopvar); _dc<4>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc<10>(register uint8_t & loopvar) { _dc<6>(loopvar); _dc<4>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc<11>(register uint8_t & loopvar) { _dc<10>(loopvar); _dc<1>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc<12>(register uint8_t & loopvar) { _dc<10>(loopvar); _dc<2>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc<13>(register uint8_t & loopvar) { _dc<10>(loopvar); _dc<3>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc<14>(register uint8_t & loopvar) { _dc<10>(loopvar); _dc<4>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc<15>(register uint8_t & loopvar) { _dc<10>(loopvar); _dc<5>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc<16>(register uint8_t & loopvar) { _dc<10>(loopvar); _dc<6>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc<17>(register uint8_t & loopvar) { _dc<10>(loopvar); _dc<7>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc<18>(register uint8_t & loopvar) { _dc<10>(loopvar); _dc<8>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc<19>(register uint8_t & loopvar) { _dc<10>(loopvar); _dc<9>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc<20>(register uint8_t & loopvar) { _dc<10>(loopvar); _dc<10>(loopvar); }
+
+#define DINTPIN(T,ADJ,PINADJ) (T-(PINADJ+ADJ)>0) ? _dc<T-(PINADJ+ADJ)>(loopvar) : _dc<0>(loopvar); // delay for T minus (PINADJ+ADJ) cycles, or not at all when that is not positive; expects a local named loopvar at the point of use
+#define DINT(T,ADJ) if(AVR_PIN_CYCLES(DATA_PIN)==1) { DINTPIN(T,ADJ,1) } else { DINTPIN(T,ADJ,2); } // PINADJ is the cost of the port write itself: 1 or 2 cycles according to AVR_PIN_CYCLES(DATA_PIN)
+#define _D1(ADJ) DINT(T1,ADJ) // pad out the T1 phase; ADJ = cycles already spent on other work in that phase
+#define _D2(ADJ) DINT(T2,ADJ) // same for the T2 phase
+#define _D3(ADJ) DINT(T3,ADJ) // same for the T3 phase
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Base template for clockless controllers.  These controllers have 3 control points in their cycle for each bit.  The first point
+// is where the line is raised hi.  The second point is where the line is dropped low for a zero.  The third point is where the
+// line is dropped low for a one.  T1, T2, and T3 correspond to the timings for those three in clock cycles.
+//
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if (!defined(NO_CORRECTION) || (NO_CORRECTION == 0)) && (FASTLED_ALLOW_INTERRUPTS == 0) // same condition as the MS_COUNTER correction in showPixels()
+static uint8_t gTimeErrorAccum256ths; // fractional remainder (low 8 bits, in 256ths) of the timer correction, carried from one showPixels() call to the next
+#endif
+
+#define FASTLED_HAS_CLOCKLESS 1 // presumably tells the rest of the library that a ClocklessController is available on this platform - confirm against its users
+
+template <uint8_t DATA_PIN, int T1, int T2, int T3, EOrder RGB_ORDER = RGB, int XTRA0 = 0, bool FLIP = false, int WAIT_TIME = 10>
+class ClocklessController : public CPixelLEDController<RGB_ORDER> {
+	static_assert(T1 >= 2 && T2 >= 2 && T3 >= 3, "Not enough cycles - use a higher clock speed");
+
+	typedef typename FastPin<DATA_PIN>::port_ptr_t data_ptr_t;
+	typedef typename FastPin<DATA_PIN>::port_t data_t;
+
+	CMinWait<WAIT_TIME> mWait;
+public:
+	virtual void init() { // One-time setup: switch DATA_PIN to output mode
+		FastPin<DATA_PIN>::setOutput();
+	}
+
+	virtual uint16_t getMaxRefreshRate() const { return 400; } // fixed cap of 400 (units are not shown here; presumably refreshes per second - confirm)
+
+protected:
+
+	virtual void showPixels(PixelController<RGB_ORDER> & pixels) { // Send one frame: wait out the minimum gap, bit-bang the pixels with interrupts off, then credit the time spent to the millisecond counter
+
+		mWait.wait(); // presumably blocks until the minimum gap since the last mark() has elapsed - confirm against CMinWait
+		cli(); // no interrupts while the bit timing is being generated
+
+		showRGBInternal(pixels);
+
+		// Adjust the timer: interrupts were off for the whole transfer, so timer ticks were lost and MS_COUNTER has to be advanced by hand
+#if (!defined(NO_CORRECTION) || (NO_CORRECTION == 0)) && (FASTLED_ALLOW_INTERRUPTS == 0)
+        uint32_t microsTaken = (uint32_t)pixels.size() * (uint32_t)CLKS_TO_MICROS(24 * (T1 + T2 + T3)); // nominal time: 24 bits per pixel, T1+T2+T3 clocks per bit
+
+        // adjust for approximate observed actual runtime (as of January 2015)
+        // roughly 9.6 cycles per pixel, which is 0.6us/pixel at 16MHz
+        // microsTaken += nLeds * 0.6 * CLKS_TO_MICROS(16);
+        microsTaken += scale16by8(pixels.size(),(0.6 * 256) + 1) * CLKS_TO_MICROS(16);
+
+        // if less than 1000us, there is NO timer impact,
+        // this is because the ONE interrupt that might come in while interrupts
+        // are disabled is queued up, and it will be serviced as soon as
+        // interrupts are re-enabled.
+        // This actually should technically also account for the runtime of the
+        // interrupt handler itself, but we're just not going to worry about that.
+        if( microsTaken > 1000) {
+
+            // Since up to one timer tick will be queued, we don't need
+            // to adjust the MS_COUNTER for that one.
+            microsTaken -= 1000;
+
+            // Now convert microseconds to 256ths of a millisecond, approximately like this:
+            // 250ths = (us/4)
+            // 256ths = 250ths * (263/256);
+            uint16_t x256ths = microsTaken >> 2;
+            x256ths += scale16by8(x256ths,7);
+
+            x256ths += gTimeErrorAccum256ths; // add the fraction left over from the previous call
+            MS_COUNTER += (x256ths >> 8); // whole milliseconds go to the counter
+            gTimeErrorAccum256ths = x256ths & 0xFF; // keep the new fraction for next time
+        }
+
+#if 0
+        // For pixel counts of 30 and under at 16Mhz, no correction is necessary.
+        // For pixel counts of 15 and under at 8Mhz, no correction is necessary.
+        //
+        // This code, below, is smaller, and quicker clock correction, which drifts much
+        // more significantly, but is a few bytes smaller.  Presented here for consideration
+        // as an alternate on the ATtiny, which can't have more than about 150 pixels MAX
+        // anyway, meaning that microsTaken will never be more than about 4,500, which fits in
+        // a 16-bit variable.  The difference between /1000 and /1024 only starts showing
+        // up in the range of about 100 pixels, so many ATtiny projects won't even
+        // see a clock difference due to the approximation there.
+		uint16_t microsTaken = (uint32_t)nLeds * (uint32_t)CLKS_TO_MICROS((24) * (T1 + T2 + T3));
+        MS_COUNTER += (microsTaken >> 10);
+#endif
+
+#endif
+
+		sei(); // NOTE(review): re-enables interrupts unconditionally, even if they were already disabled when showPixels() was entered
+		mWait.mark(); // start timing the gap before the next frame
+	}
+#define USE_ASM_MACROS
+
+// The variables that our various asm statements use.  The same operand list is attached to every asm block so that GCC
+// sees each variable as live in every block; otherwise it could happily reuse (clobber) their registers or optimize code away too aggressively
+#define ASM_VARS : /* write variables */				\
+				[count] "+x" (count),					\
+				[data] "+z" (data),						\
+				[b1] "+a" (b1),							\
+				[d0] "+r" (d0),							\
+				[d1] "+r" (d1),							\
+				[d2] "+r" (d2),							\
+				[loopvar] "+a" (loopvar),				\
+				[scale_base] "+a" (scale_base)			\
+				: /* use variables */					\
+				[ADV] "r" (advanceBy),					\
+				[b0] "a" (b0),							\
+				[hi] "r" (hi),							\
+				[lo] "r" (lo),							\
+				[s0] "r" (s0),					  		\
+				[s1] "r" (s1),							\
+				[s2] "r" (s2),							\
+				[e0] "r" (e0),							\
+				[e1] "r" (e1),							\
+				[e2] "r" (e2),							\
+				[PORT] "M" (FastPin<DATA_PIN>::port()-0x20),		\
+				[O0] "M" (RGB_BYTE0(RGB_ORDER)),		\
+				[O1] "M" (RGB_BYTE1(RGB_ORDER)),		\
+				[O2] "M" (RGB_BYTE2(RGB_ORDER))		\
+				: "cc" /* clobber registers */
+
+
+// Note: HI1/LO1 use a single `out` when the port register lies within the first 64 I/O addresses (port address - 0x20 < 64); otherwise the code in the else in HI1/LO1 will be turned into an sts (2 cycle, 2 word) opcode
+// 1 cycle (2 on the sts path), write hi to the port; FASTLED_SLOW_CLOCK_ADJUST is expanded first
+#define HI1 FASTLED_SLOW_CLOCK_ADJUST if((int)(FastPin<DATA_PIN>::port())-0x20 < 64) { asm __volatile__("out %[PORT], %[hi]" ASM_VARS ); } else { *FastPin<DATA_PIN>::port()=hi; }
+// 1 cycle (2 on the sts path), write lo to the port
+#define LO1 if((int)(FastPin<DATA_PIN>::port())-0x20 < 64) { asm __volatile__("out %[PORT], %[lo]" ASM_VARS ); } else { *FastPin<DATA_PIN>::port()=lo; }
+
+// 2 cycles, sbrs on flipping the line to lo if we're pushing out a 0: sbrs skips the LO1 write when bit N of B is set, so the line only drops early for a 0 bit
+#define QLO2(B, N) asm __volatile__("sbrs %[" #B "], " #N ASM_VARS ); LO1;
+// 2 cycles - load a byte from ram (Z + offset O) into the given var
+#define LD2(B,O) asm __volatile__("ldd %[" #B "], Z + %[" #O "]\n\t" ASM_VARS );
+// 4 cycles - load a byte from ram (Z + offset O) into the scaling scratch space scale_base, clear the target var B, clear carry
+#define LDSCL4(B,O) asm __volatile__("ldd %[scale_base], Z + %[" #O "]\n\tclr %[" #B "]\n\tclc\n\t" ASM_VARS );
+
+#if (DITHER==1)
+// 4 cycles - apply dithering value D before we do anything with scale_base: add D unless scale_base is zero (cpse skips the add), then clamp to 0xFF if the add carried
+#define PRESCALE4(D) asm __volatile__("cpse %[scale_base], __zero_reg__\n\t add %[scale_base],%[" #D "]\n\tbrcc L_%=\n\tldi %[scale_base], 0xFF\n\tL_%=:\n\t" ASM_VARS);
+
+// 2 cycles - first half of the prescale: do the add (skipped when scale_base is zero)
+#define PRESCALEA2(D) asm __volatile__("cpse %[scale_base], __zero_reg__\n\t add %[scale_base],%[" #D "]\n\t" ASM_VARS);
+
+// 4 cycles - second half: do the clamp for the prescale, negate D, clear carry when we're done - NOTE: this starts by testing the carry left by PRESCALEA2, so the carry flag state must be preserved by everything in between!
+#define PRESCALEB4(D) asm __volatile__("brcc L_%=\n\tldi %[scale_base], 0xFF\n\tL_%=:\n\tneg %[" #D "]\n\tCLC" ASM_VARS);
+
+// 4 cycles - clamp for prescale, then advance the data pointer by ADV; since we won't ever wrap 65k, the 16-bit add also effectively clears carry for us
+#define PSBIDATA4(D) asm __volatile__("brcc L_%=\n\tldi %[scale_base], 0xFF\n\tL_%=:\n\tadd %A[data], %[ADV]\n\tadc %B[data], __zero_reg__\n\t" ASM_VARS);
+
+#else // dithering disabled: keep the same cycle counts with plain delays; PSBIDATA4 still advances the data pointer
+#define PRESCALE4(D) _dc<4>(loopvar);
+#define PRESCALEA2(D) _dc<2>(loopvar);
+#define PRESCALEB4(D) _dc<4>(loopvar);
+#define PSBIDATA4(D) asm __volatile__( "add %A[data], %[ADV]\n\tadc %B[data], __zero_reg__\n\trjmp .+0\n\t" ASM_VARS );
+#endif
+
+// 2 cycles - perform one step of the scaling (if bit N is set in the scale value, add scale_base to the scratch space B); the 02/12/22 variants test s0/s1/s2 respectively
+#define _SCALE02(B, N) "sbrc %[s0], " #N "\n\tadd %[" #B "], %[scale_base]\n\t"
+#define _SCALE12(B, N) "sbrc %[s1], " #N "\n\tadd %[" #B "], %[scale_base]\n\t"
+#define _SCALE22(B, N) "sbrc %[s2], " #N "\n\tadd %[" #B "], %[scale_base]\n\t"
+#define SCALE02(B,N) asm __volatile__( _SCALE02(B,N) ASM_VARS );
+#define SCALE12(B,N) asm __volatile__( _SCALE12(B,N) ASM_VARS );
+#define SCALE22(B,N) asm __volatile__( _SCALE22(B,N) ASM_VARS );
+
+// 1 cycle - rotate right, pulling in from carry
+#define _ROR1(B) "ror %[" #B "]\n\t"
+#define ROR1(B) asm __volatile__( _ROR1(B) ASM_VARS);
+
+// 1 cycle, clear the carry bit
+#define _CLC1 "clc\n\t"
+#define CLC1 asm __volatile__( _CLC1 ASM_VARS );
+
+// 2 cycles, rotate right, pulling in from carry then clear the carry bit
+#define RORCLC2(B) asm __volatile__( _ROR1(B) _CLC1 ASM_VARS );
+
+// 4 cycles, rotate, clear carry, scale next bit
+#define RORSC04(B, N) asm __volatile__( _ROR1(B) _CLC1 _SCALE02(B, N) ASM_VARS );
+#define RORSC14(B, N) asm __volatile__( _ROR1(B) _CLC1 _SCALE12(B, N) ASM_VARS );
+#define RORSC24(B, N) asm __volatile__( _ROR1(B) _CLC1 _SCALE22(B, N) ASM_VARS );
+
+// 4 cycles, scale bit, rotate, clear carry
+#define SCROR04(B, N) asm __volatile__( _SCALE02(B,N) _ROR1(B) _CLC1 ASM_VARS );
+#define SCROR14(B, N) asm __volatile__( _SCALE12(B,N) _ROR1(B) _CLC1 ASM_VARS );
+#define SCROR24(B, N) asm __volatile__( _SCALE22(B,N) _ROR1(B) _CLC1 ASM_VARS );
+
+/////////////////////////////////////////////////////////////////////////////////////
+// Loop life cycle
+
+// dither adjustment macro (D = E - D, done as neg D then add E) - should be kept in sync w/what's in stepDithering
+// #define ADJDITHER2(D, E) D = E - D;
+#define _NEGD1(D) "neg %[" #D "]\n\t"
+#define _ADJD1(D,E) "add %[" #D "], %[" #E "]\n\t"
+#define ADJDITHER2(D, E) asm __volatile__ ( _NEGD1(D) _ADJD1(D, E) ASM_VARS);
+#define ADDDE1(D, E) asm __volatile__ ( _ADJD1(D, E) ASM_VARS );
+
+// #define xstr(a) str(a)
+// #define str(a) #a
+// #define ADJDITHER2(D,E) asm __volatile__("subi %[" #D "], " xstr(DUSE) "\n\tand %[" #D "], %[" #E "]\n\t" ASM_VARS);
+
+// define the beginning of the loop (local label 1, the target of the `rjmp 1b` in JMPLOOP2 and ENDLOOP5)
+#define LOOP asm __volatile__("1:" ASM_VARS );
+// define the end of the loop (local label 2, the target of the `rjmp 2f` in BRLOOP1)
+#define DONE asm __volatile__("2:" ASM_VARS );
+
+// 2 cycles - advance the 16-bit data pointer by ADV (add to the low byte, carry into the high byte); IDATACLC3 spends a third cycle clearing carry afterwards
+#define IDATA2 asm __volatile__("add %A[data], %[ADV]\n\tadc %B[data], __zero_reg__\n\t"  ASM_VARS );
+#define IDATACLC3 asm __volatile__("add %A[data], %[ADV]\n\tadc %B[data], __zero_reg__\n\t" _CLC1  ASM_VARS );
+
+// 1 cycle mov (copies B2 into B1)
+#define _MOV1(B1, B2) "mov %[" #B1 "], %[" #B2 "]\n\t"
+
+#define MOV1(B1, B2) asm __volatile__( _MOV1(B1,B2) ASM_VARS );
+
+// 3 cycle mov - skip if scale fix is happening: B1 first receives scale_base, then cpse skips the mov from B2 when the scale register is zero (after the s0++/s1++/s2++ in showRGBInternal a zero means the requested scale was 255), leaving the unscaled value in B1
+#if (FASTLED_SCALE8_FIXED == 1)
+#define _MOV_FIX03(B1, B2) "mov %[" #B1 "], %[scale_base]\n\tcpse %[s0], __zero_reg__\n\t" _MOV1(B1, B2)
+#define _MOV_FIX13(B1, B2) "mov %[" #B1 "], %[scale_base]\n\tcpse %[s1], __zero_reg__\n\t" _MOV1(B1, B2)
+#define _MOV_FIX23(B1, B2) "mov %[" #B1 "], %[scale_base]\n\tcpse %[s2], __zero_reg__\n\t" _MOV1(B1, B2)
+#else
+// if we haven't fixed scale8, just do the move and nop the 2 cycles that would be used to
+// do the fixed adjustment
+#define _MOV_FIX03(B1, B2) _MOV1(B1, B2) "rjmp .+0\n\t"
+#define _MOV_FIX13(B1, B2) _MOV1(B1, B2) "rjmp .+0\n\t"
+#define _MOV_FIX23(B1, B2) _MOV1(B1, B2) "rjmp .+0\n\t"
+#endif
+
+// 4 cycles - the 3 cycle mov above + 1 cycle of dither adjustment: negate D (MOV_NEGD*) or add E to D (MOV_ADDDE*)
+#define MOV_NEGD04(B1, B2, D) asm __volatile( _MOV_FIX03(B1, B2) _NEGD1(D) ASM_VARS );
+#define MOV_ADDDE04(B1, B2, D, E) asm __volatile( _MOV_FIX03(B1, B2) _ADJD1(D, E) ASM_VARS );
+#define MOV_NEGD14(B1, B2, D) asm __volatile( _MOV_FIX13(B1, B2) _NEGD1(D) ASM_VARS );
+#define MOV_ADDDE14(B1, B2, D, E) asm __volatile( _MOV_FIX13(B1, B2) _ADJD1(D, E) ASM_VARS );
+#define MOV_NEGD24(B1, B2, D) asm __volatile( _MOV_FIX23(B1, B2) _NEGD1(D) ASM_VARS );
+
+// 2 cycles - decrement the 16-bit pixel counter (sbiw also updates the zero flag that the loop-exit branches test)
+#define DCOUNT2 asm __volatile__("sbiw %[count], 1" ASM_VARS );
+// 2 cycles - jump to the beginning of the loop (back to local label 1)
+#define JMPLOOP2 asm __volatile__("rjmp 1b" ASM_VARS );
+// Jump out of the loop once the counter has reached zero: while the zero flag is clear, brne hops over the exit jump to the local label 3 defined here; otherwise rjmp leaves for local label 2. The branch operand has to be written "3f" - a bare 3 is a plain number to the assembler, not a reference to the local label.
+#define BRLOOP1 asm __volatile__("brne 3f\n\trjmp 2f\n\t3:" ASM_VARS );
+
+// 5 cycles: 2 for the sbiw that decrements count, 3 for the breq/rjmp pair - jumps back to label 1 until count reaches zero, then falls out through the local L_ label
+#define ENDLOOP5 asm __volatile__("sbiw %[count], 1\n\tbreq L_%=\n\trjmp 1b\n\tL_%=:\n\t" ASM_VARS);
+
+// NOP that lists all the ASM_VARS operands, forcing the compiler to move them into their registers at this point
+#define DNOP asm __volatile__("mov r0,r0" ASM_VARS);
+
+#define DADVANCE 3 // NOTE(review): in this view only DUSE refers to it - confirm whether anything else still depends on it
+#define DUSE (0xFF - (DADVANCE-1)) // 0xFD while DADVANCE is 3; only referenced by the commented-out ADJDITHER2 variant in this view
+
+// Silence compiler warnings about switch/case that is explicitly intended to fall through (the macro carries its own trailing semicolon).
+#define FL_FALLTHROUGH __attribute__ ((fallthrough));
+
+	// This method is made static to force making register Y available to use for data on AVR - if the method is non-static, then
+	// gcc will use register Y for the this pointer.  NOTE(review): ASM_VARS binds data to Z ("+z"), so confirm what Y is actually freed up for.
+	static void /*__attribute__((optimize("O0")))*/  /*__attribute__ ((always_inline))*/  showRGBInternal(PixelController<RGB_ORDER> & pixels)  {
+		uint8_t *data = (uint8_t*)pixels.mData;
+		data_ptr_t port = FastPin<DATA_PIN>::port();
+		data_t mask = FastPin<DATA_PIN>::mask();
+		uint8_t scale_base = 0;
+
+		// register uint8_t *end = data + nLeds;
+		data_t hi = *port | mask; // port value with the data pin high
+		data_t lo = *port & ~mask; // port value with the data pin low
+		*port = lo; // start with the line low
+
+		// the byte currently being written out
+		uint8_t b0 = 0;
+		// the byte currently being worked on to write the next out
+		uint8_t b1 = 0;
+
+		// Setup the pixel controller
+		pixels.preStepFirstByteDithering();
+
+		// pull the dithering/adjustment values out of the pixels object for direct asm access
+		uint8_t advanceBy = pixels.advanceBy();
+		uint16_t count = pixels.mLen;
+
+		uint8_t s0 = pixels.mScale.raw[RO(0)];
+		uint8_t s1 = pixels.mScale.raw[RO(1)];
+		uint8_t s2 = pixels.mScale.raw[RO(2)];
+#if (FASTLED_SCALE8_FIXED==1)
+		s0++; s1++; s2++; // scale + 1; a scale of 255 wraps to 0, which the _MOV_FIX* macros treat as "use the unscaled value"
+#endif
+		uint8_t d0 = pixels.d[RO(0)];
+		uint8_t d1 = pixels.d[RO(1)];
+		uint8_t d2 = pixels.d[RO(2)];
+		uint8_t e0 = pixels.e[RO(0)];
+		uint8_t e1 = pixels.e[RO(1)];
+		uint8_t e2 = pixels.e[RO(2)];
+
+		uint8_t loopvar=0;
+
+		// This has to be done in asm to keep gcc from messing up the asm code further down
+		b0 = data[RO(0)];
+		{ // prime the pipeline: dither and scale the first byte into b0 before the output loop starts
+			LDSCL4(b0,O0) 	PRESCALEA2(d0)
+			PRESCALEB4(d0)	SCALE02(b0,0)
+			RORSC04(b0,1) 	ROR1(b0) CLC1
+			SCROR04(b0,2)		SCALE02(b0,3)
+			RORSC04(b0,4) 	ROR1(b0) CLC1
+			SCROR04(b0,5) 	SCALE02(b0,6)
+			RORSC04(b0,7) 	ROR1(b0) CLC1
+			MOV_ADDDE04(b1,b0,d0,e0)
+			MOV1(b0,b1)
+		}
+
+		{
+			// while(--count)
+			{
+				// Loop beginning
+				DNOP;
+				LOOP;
+
+				// Sum of the clock counts across each row should be 10 for 8Mhz, WS2811
+				// The values in the D1/D2/D3 indicate how many cycles the previous column takes
+				// to allow things to line back up.
+				//
+				// While writing out byte 0, we're loading up byte 1, applying the dithering adjustment,
+				// then scaling it using 8 cycles of shift/add interleaved in between writing the bits
+				// out.  When doing byte 1, we're doing the above for byte 2.  When we're doing byte 2,
+				// we're cycling back around and doing the above for byte 0.
+
+				// Inline scaling - RGB ordering
+				// DNOP
+				HI1 _D1(1) QLO2(b0, 7) LDSCL4(b1,O1) 	_D2(4)	LO1	PRESCALEA2(d1)	_D3(2)
+				HI1 _D1(1) QLO2(b0, 6) PRESCALEB4(d1)	_D2(4)	LO1	SCALE12(b1,0)	_D3(2)
+				HI1 _D1(1) QLO2(b0, 5) RORSC14(b1,1) 	_D2(4)	LO1 RORCLC2(b1)		_D3(2)
+				HI1 _D1(1) QLO2(b0, 4) SCROR14(b1,2)	_D2(4)	LO1 SCALE12(b1,3)	_D3(2)
+				HI1 _D1(1) QLO2(b0, 3) RORSC14(b1,4) 	_D2(4)	LO1 RORCLC2(b1) 	_D3(2)
+				HI1 _D1(1) QLO2(b0, 2) SCROR14(b1,5) 	_D2(4)	LO1 SCALE12(b1,6)	_D3(2)
+				HI1 _D1(1) QLO2(b0, 1) RORSC14(b1,7) 	_D2(4)	LO1 RORCLC2(b1) 	_D3(2)
+				HI1 _D1(1) QLO2(b0, 0)
+				switch(XTRA0) { // XTRA0 (0..4) repeats the bit-0 slot that many extra times via deliberate fallthrough
+					case 4: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0)  FL_FALLTHROUGH
+					case 3: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0)  FL_FALLTHROUGH
+					case 2: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0)  FL_FALLTHROUGH
+					case 1: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0)
+				}
+				MOV_ADDDE14(b0,b1,d1,e1) _D2(4) LO1 _D3(0)
+
+				HI1 _D1(1) QLO2(b0, 7) LDSCL4(b1,O2) 	_D2(4)	LO1	PRESCALEA2(d2)	_D3(2)
+				HI1 _D1(1) QLO2(b0, 6) PSBIDATA4(d2)	_D2(4)	LO1	SCALE22(b1,0)	_D3(2)
+				HI1 _D1(1) QLO2(b0, 5) RORSC24(b1,1) 	_D2(4)	LO1 RORCLC2(b1) 	_D3(2)
+				HI1 _D1(1) QLO2(b0, 4) SCROR24(b1,2)	_D2(4)	LO1 SCALE22(b1,3)	_D3(2)
+				HI1 _D1(1) QLO2(b0, 3) RORSC24(b1,4) 	_D2(4)	LO1 RORCLC2(b1) 	_D3(2)
+				HI1 _D1(1) QLO2(b0, 2) SCROR24(b1,5) 	_D2(4)	LO1 SCALE22(b1,6)	_D3(2)
+				HI1 _D1(1) QLO2(b0, 1) RORSC24(b1,7) 	_D2(4)	LO1 RORCLC2(b1) 	_D3(2)
+				HI1 _D1(1) QLO2(b0, 0)
+				switch(XTRA0) {
+					case 4: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0)  FL_FALLTHROUGH
+					case 3: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0)  FL_FALLTHROUGH
+					case 2: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0)  FL_FALLTHROUGH
+					case 1: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0)
+				}
+
+				// Because Prescale on the middle byte also increments the data counter,
+				// we have to do both halves of updating d2 here - negating it (in the
+				// MOV_NEGD24 macro) and then adding E back into it
+				MOV_NEGD24(b0,b1,d2) _D2(4) LO1 ADDDE1(d2,e2) _D3(1)
+				HI1 _D1(1) QLO2(b0, 7) LDSCL4(b1,O0) 	_D2(4)	LO1	PRESCALEA2(d0)	_D3(2)
+				HI1 _D1(1) QLO2(b0, 6) PRESCALEB4(d0)	_D2(4)	LO1	SCALE02(b1,0)	_D3(2)
+				HI1 _D1(1) QLO2(b0, 5) RORSC04(b1,1) 	_D2(4)	LO1 RORCLC2(b1) 	_D3(2)
+				HI1 _D1(1) QLO2(b0, 4) SCROR04(b1,2)	_D2(4)	LO1 SCALE02(b1,3)	_D3(2)
+				HI1 _D1(1) QLO2(b0, 3) RORSC04(b1,4) 	_D2(4)	LO1 RORCLC2(b1)  	_D3(2)
+				HI1 _D1(1) QLO2(b0, 2) SCROR04(b1,5) 	_D2(4)	LO1 SCALE02(b1,6)	_D3(2)
+				HI1 _D1(1) QLO2(b0, 1) RORSC04(b1,7) 	_D2(4)	LO1 RORCLC2(b1) 	_D3(2)
+				HI1 _D1(1) QLO2(b0, 0)
+				switch(XTRA0) {
+					case 4: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0)  FL_FALLTHROUGH
+					case 3: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0)  FL_FALLTHROUGH
+					case 2: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0)  FL_FALLTHROUGH
+					case 1: _D2(0) LO1 _D3(0) HI1 _D1(1) QLO2(b0,0)
+				}
+				MOV_ADDDE04(b0,b1,d0,e0) _D2(4) LO1 _D3(5)
+				ENDLOOP5
+			}
+			DONE;
+		}
+
+		#if (FASTLED_ALLOW_INTERRUPTS == 1)
+		// stop using the clock juggler
+		TCCR0A &= ~0x30;
+		#endif
+	}
+
+};
+
+#endif
+
+FASTLED_NAMESPACE_END
+
+#endif
@@ -0,0 +1,13 @@
+#ifndef __INC_FASTLED_AVR_H
+#define __INC_FASTLED_AVR_H
+
+#include "fastpin_avr.h"
+#include "fastspi_avr.h"
+#include "clockless_trinket.h"
+
+// Default to using PROGMEM; a build can override this by defining FASTLED_USE_PROGMEM before this header is included
+#ifndef FASTLED_USE_PROGMEM
+#define FASTLED_USE_PROGMEM 1
+#endif
+
+#endif
@@ -0,0 +1,361 @@
+#ifndef __INC_FASTPIN_AVR_H
+#define __INC_FASTPIN_AVR_H
+
+FASTLED_NAMESPACE_BEGIN
+
+#if defined(FASTLED_FORCE_SOFTWARE_PINS)
+#warning "Software pin support forced, pin access will be slightly slower."
+#define NO_HARDWARE_PIN_SUPPORT
+#undef HAS_HARDWARE_PIN_SUPPORT
+
+#else
+
+#define AVR_PIN_CYCLES(_PIN) ((((int)FastPin<_PIN>::port())-0x20 < 64) ? 1 : 2) // cost in cycles of one port write for this pin: 1 when the port register lies within the first 64 I/O addresses (address - 0x20 < 64), otherwise 2
+
+/// Class definition for a Pin where we know the port registers at compile time for said pin.  This allows us to make
+/// a lot of optimizations, as the inlined hi/lo methods will devolve to a single io register write/bitset.
+template<uint8_t PIN, uint8_t _MASK, typename _PORT, typename _DDR, typename _PIN> class _AVRPIN {
+public:
+	typedef volatile uint8_t * port_ptr_t;
+	typedef uint8_t port_t;
+
+	inline static void setOutput() { _DDR::r() |= _MASK; } // set this pin's bit in the register returned by _DDR::r()
+	inline static void setInput() { _DDR::r() &= ~_MASK; } // clear this pin's bit in the register returned by _DDR::r()
+
+	inline static void hi() __attribute__ ((always_inline)) { _PORT::r() |= _MASK; } // set the pin's bit in the port register
+	inline static void lo() __attribute__ ((always_inline)) { _PORT::r() &= ~_MASK; } // clear the pin's bit in the port register
+	inline static void set(register uint8_t val) __attribute__ ((always_inline)) { _PORT::r() = val; } // overwrite the whole port register with val
+
+	inline static void strobe() __attribute__ ((always_inline)) { toggle(); toggle(); } // two toggles back to back
+
+	inline static void toggle() __attribute__ ((always_inline)) { _PIN::r() = _MASK; } // writes the mask to the _PIN register; presumably relies on AVR parts toggling an output when 1 is written to its PINx bit - confirm for the target part
+
+	inline static void hi(register port_ptr_t /*port*/) __attribute__ ((always_inline)) { hi(); } // the port argument is ignored in these three overloads; the register is already known from the template arguments
+	inline static void lo(register port_ptr_t /*port*/) __attribute__ ((always_inline)) { lo(); }
+	inline static void fastset(register port_ptr_t /*port*/, register uint8_t val) __attribute__ ((always_inline)) { set(val); }
+
+	inline static port_t hival() __attribute__ ((always_inline)) { return _PORT::r() | _MASK; } // current port value with this pin's bit set (nothing is written)
+	inline static port_t loval() __attribute__ ((always_inline)) { return _PORT::r() & ~_MASK; } // current port value with this pin's bit cleared (nothing is written)
+	inline static port_ptr_t port() __attribute__ ((always_inline)) { return &_PORT::r(); } // address of the port register
+	inline static port_t mask() __attribute__ ((always_inline)) { return _MASK; } // this pin's bit mask within the port
+};
+
+
+
+/// AVR definitions for pins.  Getting around the fact that I can't pass GPIO register addresses in as template arguments by instead creating
+/// a custom type for each GPIO register with a single, static, aggressively inlined function that returns that specific GPIO register.  A similar
+/// trick is used a bit further below for the ARM GPIO registers (of which there are far more than on AVR!)
+typedef volatile uint8_t & reg8_t; // reference to a volatile 8-bit register
+#define _R(T) struct __gen_struct_ ## T // names the generated per-register struct type for register T
+#define _RD8(T) struct __gen_struct_ ## T { static inline reg8_t r() { return T; }}; // defines that struct: r() returns register T by reference
+#define _FL_IO(L,C) _RD8(DDR ## L); _RD8(PORT ## L); _RD8(PIN ## L); _FL_DEFINE_PORT3(L, C, _R(PORT ## L)); // generates the DDR/PORT/PIN structs for port letter L, then hands the port to _FL_DEFINE_PORT3 (defined elsewhere) with index C
+#define _FL_DEFPIN(_PIN, BIT, L) template<> class FastPin<_PIN> : public _AVRPIN<_PIN, 1<<BIT, _R(PORT ## L), _R(DDR ## L), _R(PIN ## L)> {}; // specializes FastPin<_PIN> as bit BIT of port L
+
+// Pre-do all the port definitions: one _FL_IO(letter, index) for every port the device headers define, PORTA..PORTN mapped to indices 0..13
+#ifdef PORTA
+  _FL_IO(A,0)
+#endif
+#ifdef PORTB
+  _FL_IO(B,1)
+#endif
+#ifdef PORTC
+  _FL_IO(C,2)
+#endif
+#ifdef PORTD
+  _FL_IO(D,3)
+#endif
+#ifdef PORTE
+  _FL_IO(E,4)
+#endif
+#ifdef PORTF
+  _FL_IO(F,5)
+#endif
+#ifdef PORTG
+  _FL_IO(G,6)
+#endif
+#ifdef PORTH
+  _FL_IO(H,7)
+#endif
+#ifdef PORTI
+  _FL_IO(I,8)
+#endif
+#ifdef PORTJ
+  _FL_IO(J,9)
+#endif
+#ifdef PORTK
+  _FL_IO(K,10)
+#endif
+#ifdef PORTL
+  _FL_IO(L,11)
+#endif
+#ifdef PORTM
+  _FL_IO(M,12)
+#endif
+#ifdef PORTN
+  _FL_IO(N,13)
+#endif
+
+#if defined(__AVR_ATtiny85__) || defined(__AVR_ATtiny45__) || defined(__AVR_ATtiny25__)
+
+#if defined(__AVR_ATtiny25__)
+#pragma message "ATtiny25 has very limited storage. This library could use up to more than 100% of its flash size"
+#endif
+
+#define MAX_PIN 5
+
+_FL_DEFPIN(0, 0, B); _FL_DEFPIN(1, 1, B); _FL_DEFPIN(2, 2, B); _FL_DEFPIN(3, 3, B);
+_FL_DEFPIN(4, 4, B); _FL_DEFPIN(5, 5, B);
+
+#define HAS_HARDWARE_PIN_SUPPORT 1
+
+#elif defined(__AVR_ATtiny841__) || defined(__AVR_ATtiny441__)
+#define MAX_PIN 11
+
+_FL_DEFPIN(0, 0, B); _FL_DEFPIN(1, 1, B); _FL_DEFPIN(2, 2, B);
+_FL_DEFPIN(3, 7, A); _FL_DEFPIN(4, 6, A); _FL_DEFPIN(5, 5, A);
+_FL_DEFPIN(6, 4, A); _FL_DEFPIN(7, 3, A); _FL_DEFPIN(8, 2, A);
+_FL_DEFPIN(9, 1, A); _FL_DEFPIN(10, 0, A); _FL_DEFPIN(11, 3, B);
+
+#define HAS_HARDWARE_PIN_SUPPORT 1
+
+#elif defined(ARDUINO_AVR_DIGISPARK) // digispark pin layout
+#define MAX_PIN 5
+#define HAS_HARDWARE_PIN_SUPPORT 1
+
+_FL_DEFPIN(0, 0, B); _FL_DEFPIN(1, 1, B); _FL_DEFPIN(2, 2, B);
+_FL_DEFPIN(3, 7, A); _FL_DEFPIN(4, 6, A); _FL_DEFPIN(5, 5, A);
+
+#elif defined(__AVR_ATtiny24__) || defined(__AVR_ATtiny44__) || defined(__AVR_ATtiny84__)
+
+#define MAX_PIN 10
+
+_FL_DEFPIN(0, 0, A); _FL_DEFPIN(1, 1, A); _FL_DEFPIN(2, 2, A); _FL_DEFPIN(3, 3, A);
+_FL_DEFPIN(4, 4, A); _FL_DEFPIN(5, 5, A); _FL_DEFPIN(6, 6, A); _FL_DEFPIN(7, 7, A);
+_FL_DEFPIN(8, 2, B); _FL_DEFPIN(9, 1, B); _FL_DEFPIN(10, 0, B);
+
+#define HAS_HARDWARE_PIN_SUPPORT 1
+
+#elif defined(ARDUINO_AVR_DIGISPARKPRO)
+
+#define MAX_PIN 12
+
+_FL_DEFPIN(0, 0, B); _FL_DEFPIN(1, 1, B); _FL_DEFPIN(2, 2, B); _FL_DEFPIN(3, 5, B);
+_FL_DEFPIN(4, 3, B); _FL_DEFPIN(5, 7, A); _FL_DEFPIN(6, 0, A); _FL_DEFPIN(7, 1, A);
+_FL_DEFPIN(8, 2, A); _FL_DEFPIN(9, 3, A); _FL_DEFPIN(10, 4, A); _FL_DEFPIN(11, 5, A);
+_FL_DEFPIN(12, 6, A);
+
+#elif defined(__AVR_ATtiny167__) || defined(__AVR_ATtiny87__)
+
+#define MAX_PIN 15
+
+_FL_DEFPIN(0, 0, A);  _FL_DEFPIN(1, 1, A);   _FL_DEFPIN(2, 2, A);  _FL_DEFPIN(3, 3, A);
+_FL_DEFPIN(4, 4, A);  _FL_DEFPIN(5, 5, A);   _FL_DEFPIN(6, 6, A);  _FL_DEFPIN(7, 7, A);
+_FL_DEFPIN(8, 0, B);  _FL_DEFPIN(9, 1, B);   _FL_DEFPIN(10, 2, B); _FL_DEFPIN(11, 3, B);
+_FL_DEFPIN(12, 4, B); _FL_DEFPIN(13, 5, B); _FL_DEFPIN(14, 6, B); _FL_DEFPIN(15, 7, B);
+
+#define SPI_DATA 4
+#define SPI_CLOCK 5
+#define AVR_HARDWARE_SPI 1
+
+#define HAS_HARDWARE_PIN_SUPPORT 1
+#elif defined(ARDUINO_HOODLOADER2) && (defined(__AVR_ATmega32U2__) || defined(__AVR_ATmega16U2__) || defined(__AVR_ATmega8U2__)) || defined(__AVR_AT90USB82__) || defined(__AVR_AT90USB162__)
+
+#define MAX_PIN 20
+
+_FL_DEFPIN( 0, 0, B); _FL_DEFPIN( 1, 1, B); _FL_DEFPIN( 2, 2, B); _FL_DEFPIN( 3, 3, B);
+_FL_DEFPIN( 4, 4, B); _FL_DEFPIN( 5, 5, B); _FL_DEFPIN( 6, 6, B); _FL_DEFPIN( 7, 7, B);
+
+_FL_DEFPIN( 8, 7, C); _FL_DEFPIN( 9, 6, C); _FL_DEFPIN( 10, 5,C); _FL_DEFPIN( 11, 4, C);
+_FL_DEFPIN( 12, 2, C); _FL_DEFPIN( 13, 0, D); _FL_DEFPIN( 14, 1, D); _FL_DEFPIN(15, 2, D);
+_FL_DEFPIN( 16, 3, D); _FL_DEFPIN( 17, 4, D); _FL_DEFPIN( 18, 5, D); _FL_DEFPIN( 19, 6, D);
+_FL_DEFPIN( 20, 7, D);
+
+#define HAS_HARDWARE_PIN_SUPPORT 1
+// #define SPI_DATA 2
+// #define SPI_CLOCK 1
+// #define AVR_HARDWARE_SPI 1
+
+#elif defined(IS_BEAN)
+
+#define MAX_PIN 19
+_FL_DEFPIN( 0, 6, D); _FL_DEFPIN( 1, 1, B); _FL_DEFPIN( 2, 2, B); _FL_DEFPIN( 3, 3, B);
+_FL_DEFPIN( 4, 4, B); _FL_DEFPIN( 5, 5, B); _FL_DEFPIN( 6, 0, D); _FL_DEFPIN( 7, 7, D);
+_FL_DEFPIN( 8, 0, B); _FL_DEFPIN( 9, 1, D); _FL_DEFPIN(10, 2, D); _FL_DEFPIN(11, 3, D);
+_FL_DEFPIN(12, 4, D); _FL_DEFPIN(13, 5, D); _FL_DEFPIN(14, 0, C); _FL_DEFPIN(15, 1, C);
+_FL_DEFPIN(16, 2, C); _FL_DEFPIN(17, 3, C); _FL_DEFPIN(18, 4, C); _FL_DEFPIN(19, 5, C);
+
+#define SPI_DATA 3
+#define SPI_CLOCK 5
+#define SPI_SELECT 2
+#define AVR_HARDWARE_SPI 1
+#define HAS_HARDWARE_PIN_SUPPORT 1
+
+#ifndef __AVR_ATmega8__
+#define SPI_UART0_DATA 9
+#define SPI_UART0_CLOCK 12
+#endif
+
+#elif defined(__AVR_ATmega328P__) || defined(__AVR_ATmega328PB__) || defined(__AVR_ATmega328__) || defined(__AVR_ATmega168__) || defined(__AVR_ATmega168P__) || defined(__AVR_ATmega8__)
+
+#define MAX_PIN 19
+_FL_DEFPIN( 0, 0, D); _FL_DEFPIN( 1, 1, D); _FL_DEFPIN( 2, 2, D); _FL_DEFPIN( 3, 3, D);
+_FL_DEFPIN( 4, 4, D); _FL_DEFPIN( 5, 5, D); _FL_DEFPIN( 6, 6, D); _FL_DEFPIN( 7, 7, D);
+_FL_DEFPIN( 8, 0, B); _FL_DEFPIN( 9, 1, B); _FL_DEFPIN(10, 2, B); _FL_DEFPIN(11, 3, B);
+_FL_DEFPIN(12, 4, B); _FL_DEFPIN(13, 5, B); _FL_DEFPIN(14, 0, C); _FL_DEFPIN(15, 1, C);
+_FL_DEFPIN(16, 2, C); _FL_DEFPIN(17, 3, C); _FL_DEFPIN(18, 4, C); _FL_DEFPIN(19, 5, C);
+
+#define SPI_DATA 11
+#define SPI_CLOCK 13
+#define SPI_SELECT 10
+#define AVR_HARDWARE_SPI 1
+#define HAS_HARDWARE_PIN_SUPPORT 1
+
+#ifndef __AVR_ATmega8__
+#define SPI_UART0_DATA 1
+#define SPI_UART0_CLOCK 4
+#endif
+
+#elif defined(__AVR_ATmega1284P__) || defined(__AVR_ATmega644P__) || defined(__AVR_ATmega32__) || defined(__AVR_ATmega16__)
+
+// Pin table for the ATmega1284P/644P/32/16: pins 0-31 walk ports B, D, C and A
+// in that order, eight bits each. MAX_PIN matches the highest pin declared.
+#define MAX_PIN 31
+_FL_DEFPIN(0, 0, B); _FL_DEFPIN(1, 1, B); _FL_DEFPIN(2, 2, B); _FL_DEFPIN(3, 3, B);
+_FL_DEFPIN(4, 4, B); _FL_DEFPIN(5, 5, B); _FL_DEFPIN(6, 6, B); _FL_DEFPIN(7, 7, B);
+_FL_DEFPIN(8, 0, D); _FL_DEFPIN(9, 1, D); _FL_DEFPIN(10, 2, D); _FL_DEFPIN(11, 3, D);
+_FL_DEFPIN(12, 4, D); _FL_DEFPIN(13, 5, D); _FL_DEFPIN(14, 6, D); _FL_DEFPIN(15, 7, D);
+_FL_DEFPIN(16, 0, C); _FL_DEFPIN(17, 1, C); _FL_DEFPIN(18, 2, C); _FL_DEFPIN(19, 3, C);
+_FL_DEFPIN(20, 4, C); _FL_DEFPIN(21, 5, C); _FL_DEFPIN(22, 6, C); _FL_DEFPIN(23, 7, C);
+_FL_DEFPIN(24, 0, A); _FL_DEFPIN(25, 1, A); _FL_DEFPIN(26, 2, A); _FL_DEFPIN(27, 3, A);
+_FL_DEFPIN(28, 4, A); _FL_DEFPIN(29, 5, A); _FL_DEFPIN(30, 6, A); _FL_DEFPIN(31, 7, A);
+
+// Pin numbers (from the table above) used by the hardware SPI peripheral
+#define SPI_DATA 5
+#define SPI_CLOCK 7
+#define SPI_SELECT 4
+#define AVR_HARDWARE_SPI 1
+#define HAS_HARDWARE_PIN_SUPPORT 1
+
+#elif  defined(__AVR_ATmega128RFA1__) || defined(__AVR_ATmega256RFR2__)
+
+// AKA the Pinoccio
+// MAX_PIN is the highest pin number declared below, as in every other board branch of this chain.
+#define MAX_PIN 31
+_FL_DEFPIN( 0, 0, E); _FL_DEFPIN( 1, 1, E); _FL_DEFPIN( 2, 7, B); _FL_DEFPIN( 3, 3, E);
+_FL_DEFPIN( 4, 4, E); _FL_DEFPIN( 5, 5, E); _FL_DEFPIN( 6, 2, E); _FL_DEFPIN( 7, 6, E);
+_FL_DEFPIN( 8, 5, D); _FL_DEFPIN( 9, 0, B); _FL_DEFPIN(10, 2, B); _FL_DEFPIN(11, 3, B);
+_FL_DEFPIN(12, 1, B); _FL_DEFPIN(13, 2, D); _FL_DEFPIN(14, 3, D); _FL_DEFPIN(15, 0, D);
+_FL_DEFPIN(16, 1, D); _FL_DEFPIN(17, 4, D); _FL_DEFPIN(18, 7, E); _FL_DEFPIN(19, 6, D);
+_FL_DEFPIN(20, 7, D); _FL_DEFPIN(21, 4, B); _FL_DEFPIN(22, 5, B); _FL_DEFPIN(23, 6, B);
+_FL_DEFPIN(24, 0, F); _FL_DEFPIN(25, 1, F); _FL_DEFPIN(26, 2, F); _FL_DEFPIN(27, 3, F);
+_FL_DEFPIN(28, 4, F); _FL_DEFPIN(29, 5, F); _FL_DEFPIN(30, 6, F); _FL_DEFPIN(31, 7, F);
+
+// Pin numbers (from the table above) used by the hardware SPI peripheral
+#define SPI_DATA 10
+#define SPI_CLOCK 12
+#define SPI_SELECT 9
+
+#define AVR_HARDWARE_SPI 1
+#define HAS_HARDWARE_PIN_SUPPORT 1
+
+#elif defined(__AVR_ATmega1280__) || defined(__AVR_ATmega2560__)
+// megas
+// Pin table for the ATmega1280/2560: 70 pins (0-69) spread over ports A-L.
+// MAX_PIN matches the highest pin number declared below.
+#define MAX_PIN 69
+_FL_DEFPIN(0, 0, E); _FL_DEFPIN(1, 1, E); _FL_DEFPIN(2, 4, E); _FL_DEFPIN(3, 5, E);
+_FL_DEFPIN(4, 5, G); _FL_DEFPIN(5, 3, E); _FL_DEFPIN(6, 3, H); _FL_DEFPIN(7, 4, H);
+_FL_DEFPIN(8, 5, H); _FL_DEFPIN(9, 6, H); _FL_DEFPIN(10, 4, B); _FL_DEFPIN(11, 5, B);
+_FL_DEFPIN(12, 6, B); _FL_DEFPIN(13, 7, B); _FL_DEFPIN(14, 1, J); _FL_DEFPIN(15, 0, J);
+_FL_DEFPIN(16, 1, H); _FL_DEFPIN(17, 0, H); _FL_DEFPIN(18, 3, D); _FL_DEFPIN(19, 2, D);
+_FL_DEFPIN(20, 1, D); _FL_DEFPIN(21, 0, D); _FL_DEFPIN(22, 0, A); _FL_DEFPIN(23, 1, A);
+_FL_DEFPIN(24, 2, A); _FL_DEFPIN(25, 3, A); _FL_DEFPIN(26, 4, A); _FL_DEFPIN(27, 5, A);
+_FL_DEFPIN(28, 6, A); _FL_DEFPIN(29, 7, A); _FL_DEFPIN(30, 7, C); _FL_DEFPIN(31, 6, C);
+_FL_DEFPIN(32, 5, C); _FL_DEFPIN(33, 4, C); _FL_DEFPIN(34, 3, C); _FL_DEFPIN(35, 2, C);
+_FL_DEFPIN(36, 1, C); _FL_DEFPIN(37, 0, C); _FL_DEFPIN(38, 7, D); _FL_DEFPIN(39, 2, G);
+_FL_DEFPIN(40, 1, G); _FL_DEFPIN(41, 0, G); _FL_DEFPIN(42, 7, L); _FL_DEFPIN(43, 6, L);
+_FL_DEFPIN(44, 5, L); _FL_DEFPIN(45, 4, L); _FL_DEFPIN(46, 3, L); _FL_DEFPIN(47, 2, L);
+_FL_DEFPIN(48, 1, L); _FL_DEFPIN(49, 0, L); _FL_DEFPIN(50, 3, B); _FL_DEFPIN(51, 2, B);
+_FL_DEFPIN(52, 1, B); _FL_DEFPIN(53, 0, B); _FL_DEFPIN(54, 0, F); _FL_DEFPIN(55, 1, F);
+_FL_DEFPIN(56, 2, F); _FL_DEFPIN(57, 3, F); _FL_DEFPIN(58, 4, F); _FL_DEFPIN(59, 5, F);
+_FL_DEFPIN(60, 6, F); _FL_DEFPIN(61, 7, F); _FL_DEFPIN(62, 0, K); _FL_DEFPIN(63, 1, K);
+_FL_DEFPIN(64, 2, K); _FL_DEFPIN(65, 3, K); _FL_DEFPIN(66, 4, K); _FL_DEFPIN(67, 5, K);
+_FL_DEFPIN(68, 6, K); _FL_DEFPIN(69, 7, K);
+
+// Pin numbers (from the table above) used by the hardware SPI peripheral
+#define SPI_DATA 51
+#define SPI_CLOCK 52
+#define SPI_SELECT 53
+#define AVR_HARDWARE_SPI 1
+#define HAS_HARDWARE_PIN_SUPPORT 1
+
+// Leonardo, teensy, blinkm
+#elif defined(__AVR_ATmega32U4__) && defined(CORE_TEENSY)
+
+// teensy defs
+// This branch is tested before the plain __AVR_ATmega32U4__ branch further down,
+// so a 32U4 built with CORE_TEENSY gets this numbering rather than that one.
+#define MAX_PIN 23
+_FL_DEFPIN(0, 0, B); _FL_DEFPIN(1, 1, B); _FL_DEFPIN(2, 2, B); _FL_DEFPIN(3, 3, B);
+_FL_DEFPIN(4, 7, B); _FL_DEFPIN(5, 0, D); _FL_DEFPIN(6, 1, D); _FL_DEFPIN(7, 2, D);
+_FL_DEFPIN(8, 3, D); _FL_DEFPIN(9, 6, C); _FL_DEFPIN(10, 7, C); _FL_DEFPIN(11, 6, D);
+_FL_DEFPIN(12, 7, D); _FL_DEFPIN(13, 4, B); _FL_DEFPIN(14, 5, B); _FL_DEFPIN(15, 6, B);
+_FL_DEFPIN(16, 7, F); _FL_DEFPIN(17, 6, F); _FL_DEFPIN(18, 5, F); _FL_DEFPIN(19, 4, F);
+_FL_DEFPIN(20, 1, F); _FL_DEFPIN(21, 0, F); _FL_DEFPIN(22, 4, D); _FL_DEFPIN(23, 5, D);
+
+// Pin numbers (from the table above) used by the hardware SPI peripheral
+#define SPI_DATA 2
+#define SPI_CLOCK 1
+#define SPI_SELECT 0
+#define AVR_HARDWARE_SPI 1
+#define HAS_HARDWARE_PIN_SUPPORT 1
+
+// PD3/PD5
+// (pins 8 and 23 in the table above are the entries for bits 3 and 5 of port D)
+#define SPI_UART1_DATA 8
+#define SPI_UART1_CLOCK 23
+
+#elif defined(__AVR_AT90USB646__) || defined(__AVR_AT90USB1286__)
+// teensy++ 2 defs
+// MAX_PIN matches the highest pin number declared below.
+#define MAX_PIN 45
+_FL_DEFPIN(0, 0, D); _FL_DEFPIN(1, 1, D); _FL_DEFPIN(2, 2, D); _FL_DEFPIN(3, 3, D);
+_FL_DEFPIN(4, 4, D); _FL_DEFPIN(5, 5, D); _FL_DEFPIN(6, 6, D); _FL_DEFPIN(7, 7, D);
+_FL_DEFPIN(8, 0, E); _FL_DEFPIN(9, 1, E); _FL_DEFPIN(10, 0, C); _FL_DEFPIN(11, 1, C);
+_FL_DEFPIN(12, 2, C); _FL_DEFPIN(13, 3, C); _FL_DEFPIN(14, 4, C); _FL_DEFPIN(15, 5, C);
+_FL_DEFPIN(16, 6, C); _FL_DEFPIN(17, 7, C); _FL_DEFPIN(18, 6, E); _FL_DEFPIN(19, 7, E);
+_FL_DEFPIN(20, 0, B); _FL_DEFPIN(21, 1, B); _FL_DEFPIN(22, 2, B); _FL_DEFPIN(23, 3, B);
+_FL_DEFPIN(24, 4, B); _FL_DEFPIN(25, 5, B); _FL_DEFPIN(26, 6, B); _FL_DEFPIN(27, 7, B);
+_FL_DEFPIN(28, 0, A); _FL_DEFPIN(29, 1, A); _FL_DEFPIN(30, 2, A); _FL_DEFPIN(31, 3, A);
+_FL_DEFPIN(32, 4, A); _FL_DEFPIN(33, 5, A); _FL_DEFPIN(34, 6, A); _FL_DEFPIN(35, 7, A);
+_FL_DEFPIN(36, 4, E); _FL_DEFPIN(37, 5, E); _FL_DEFPIN(38, 0, F); _FL_DEFPIN(39, 1, F);
+_FL_DEFPIN(40, 2, F); _FL_DEFPIN(41, 3, F); _FL_DEFPIN(42, 4, F); _FL_DEFPIN(43, 5, F);
+_FL_DEFPIN(44, 6, F); _FL_DEFPIN(45, 7, F);
+
+// Pin numbers (from the table above) used by the hardware SPI peripheral
+#define SPI_DATA 22
+#define SPI_CLOCK 21
+#define SPI_SELECT 20
+#define AVR_HARDWARE_SPI 1
+#define HAS_HARDWARE_PIN_SUPPORT 1
+
+// PD3/PD5
+// (pins 3 and 5 in the table above are the entries for bits 3 and 5 of port D)
+#define SPI_UART1_DATA 3
+#define SPI_UART1_CLOCK 5
+
+
+#elif defined(__AVR_ATmega32U4__)
+
+// leonardo defs
+// Pins 24-30 repeat port bits that lower pin numbers already cover
+// (for example pin 24 and pin 4 are both bit 4 of port D).
+#define MAX_PIN 30
+_FL_DEFPIN(0, 2, D); _FL_DEFPIN(1, 3, D); _FL_DEFPIN(2, 1, D); _FL_DEFPIN(3, 0, D);
+_FL_DEFPIN(4, 4, D); _FL_DEFPIN(5, 6, C); _FL_DEFPIN(6, 7, D); _FL_DEFPIN(7, 6, E);
+_FL_DEFPIN(8, 4, B); _FL_DEFPIN(9, 5, B); _FL_DEFPIN(10, 6, B); _FL_DEFPIN(11, 7, B);
+_FL_DEFPIN(12, 6, D); _FL_DEFPIN(13, 7, C); _FL_DEFPIN(14, 3, B); _FL_DEFPIN(15, 1, B);
+_FL_DEFPIN(16, 2, B); _FL_DEFPIN(17, 0, B); _FL_DEFPIN(18, 7, F); _FL_DEFPIN(19, 6, F);
+_FL_DEFPIN(20, 5, F); _FL_DEFPIN(21, 4, F); _FL_DEFPIN(22, 1, F); _FL_DEFPIN(23, 0, F);
+_FL_DEFPIN(24, 4, D); _FL_DEFPIN(25, 7, D); _FL_DEFPIN(26, 4, B); _FL_DEFPIN(27, 5, B);
+_FL_DEFPIN(28, 6, B); _FL_DEFPIN(29, 6, D); _FL_DEFPIN(30, 5, D);
+
+// Pin numbers (from the table above) used by the hardware SPI peripheral.
+// Unlike the other board branches, no SPI_SELECT is defined here.
+#define SPI_DATA 16
+#define SPI_CLOCK 15
+#define AVR_HARDWARE_SPI 1
+#define HAS_HARDWARE_PIN_SUPPORT 1
+
+// PD3/PD5
+// (pins 1 and 30 in the table above are the entries for bits 3 and 5 of port D)
+#define SPI_UART1_DATA 1
+#define SPI_UART1_CLOCK 30
+
+#endif
+
+#endif // FASTLED_FORCE_SOFTWARE_PINS
+
+FASTLED_NAMESPACE_END
+
+#endif // __INC_FASTPIN_AVR_H
@@ -0,0 +1,682 @@
+#ifndef __INC_FASTSPI_AVR_H
+#define __INC_FASTSPI_AVR_H
+
+FASTLED_NAMESPACE_BEGIN
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Hardware SPI support using USART registers and friends
+//
+// TODO: Complete/test implementation - right now this doesn't work
+//
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// uno/mini/duemilanove
+#if defined(AVR_HARDWARE_SPI)
+
+#if defined(UBRR1)
+
+#ifndef UCPHA1
+#define UCPHA1 1
+#endif
+
+// SPI-style output built on USART1 configured for master SPI (MSPI) mode.
+//
+// Template parameters:
+//   _DATA_PIN / _CLOCK_PIN - pins switched to output in init() and toggled by hand in writeBit()
+//   _SPI_CLOCK_DIVIDER     - requested clock divider, translated into a UBRR1 value by setSPIRate()
+template <uint8_t _DATA_PIN, uint8_t _CLOCK_PIN, uint32_t _SPI_CLOCK_DIVIDER>
+class AVRUSART1SPIOutput {
+	// optional chip-select handler; NULL means there is nothing to select/release
+	Selectable *m_pSelect;
+
+public:
+	AVRUSART1SPIOutput() { m_pSelect = NULL; }
+	AVRUSART1SPIOutput(Selectable *pSelect) { m_pSelect = pSelect; }
+	void setSelect(Selectable *pSelect) { m_pSelect = pSelect; }
+
+	// Configure USART1 for MSPI operation and turn the data/clock pins into outputs.
+	void init() {
+		UBRR1 = 0;
+
+		/* Set MSPI mode of operation and SPI data mode 0. */
+		UCSR1C = (1<<UMSEL11)|(1<<UMSEL10)|(0<<UCPHA1)|(0<<UCPOL1);
+		/* Enable receiver and transmitter. */
+		UCSR1B = (1<<RXEN1)|(1<<TXEN1);
+
+		FastPin<_CLOCK_PIN>::setOutput();
+		FastPin<_DATA_PIN>::setOutput();
+
+		// must be done last, see page 206
+		setSPIRate();
+	}
+
+	// Program UBRR1 from the divider: (divider/2)-1 for dividers above 2, otherwise 0.
+	void setSPIRate() {
+		if(_SPI_CLOCK_DIVIDER > 2) {
+			UBRR1 = (_SPI_CLOCK_DIVIDER/2)-1;
+		} else {
+			UBRR1 = 0;
+		}
+	}
+
+
+	static void stop() {
+		// TODO: stop the uart spi output
+	}
+
+	// Swap the "a byte has been queued since the last wait" flag: stores `wait` as the
+	// new value and returns the value that was stored before the call.
+	static bool shouldWait(bool wait = false) __attribute__((always_inline)) {
+		static bool sWait=false;
+		const bool previous = sWait;
+		sWait = wait;
+		return previous;
+	}
+	// If a byte was queued since the last wait, spin until UDRE1 is set in UCSR1A.
+	static void wait() __attribute__((always_inline)) {
+		if(shouldWait()) {
+			while(!(UCSR1A & (1<<UDRE1)));
+		}
+	}
+	static void waitFully() __attribute__((always_inline)) { wait(); }
+
+	// Write a 16-bit word, high byte first.
+	static void writeWord(uint16_t w) __attribute__((always_inline)) { writeByte(w>>8); writeByte(w&0xFF); }
+
+	// The three byte writers differ only in when they wait: before the write, after it, or not at all.
+	static void writeByte(uint8_t b) __attribute__((always_inline)) { wait(); UDR1=b;  shouldWait(true); }
+	static void writeBytePostWait(uint8_t b) __attribute__((always_inline)) { UDR1=b; shouldWait(true); wait(); }
+	static void writeByteNoWait(uint8_t b) __attribute__((always_inline)) { UDR1=b; shouldWait(true); }
+
+
+	// Clock out a single bit by hand: drive the data pin from bit BIT of b, then pulse the clock pin.
+	template <uint8_t BIT> inline static void writeBit(uint8_t b) {
+		// bitwise test of the selected bit (a logical && here would be true for any non-zero b)
+		if(b & (1 << BIT)) {
+			FastPin<_DATA_PIN>::hi();
+		} else {
+			FastPin<_DATA_PIN>::lo();
+		}
+
+		FastPin<_CLOCK_PIN>::hi();
+		FastPin<_CLOCK_PIN>::lo();
+	}
+
+	void enable_pins() { }
+	void disable_pins() { }
+
+	// Start a transaction: assert the select handler (if any) and re-apply the clock rate.
+	void select() {
+		if(m_pSelect != NULL) {
+			m_pSelect->select();
+		}
+		enable_pins();
+		setSPIRate();
+	}
+
+	// End a transaction: release the select handler (if any).
+	void release() {
+		if(m_pSelect != NULL) {
+			m_pSelect->release();
+		}
+		disable_pins();
+	}
+
+	// Write `value` out `len` times without touching select/release.
+	static void writeBytesValueRaw(uint8_t value, int len) {
+		while(len--) {
+			writeByte(value);
+		}
+	}
+
+	// Write `value` out `len` times inside its own select/release pair.
+	void writeBytesValue(uint8_t value, int len) {
+		//setSPIRate();
+		select();
+		while(len--) {
+			writeByte(value);
+		}
+		release();
+	}
+
+	// Write a block of len uint8_ts out, passing each one through D::adjust first
+	template <class D> void writeBytes(register uint8_t *data, int len) {
+		//setSPIRate();
+		uint8_t *end = data + len;
+		select();
+		while(data != end) {
+			// a slight touch of delay here helps optimize the timing of the status register check loop (not used on ARM)
+			writeByte(D::adjust(*data++)); delaycycles<3>();
+		}
+		release();
+	}
+
+	void writeBytes(register uint8_t *data, int len) { writeBytes<DATA_NOP>(data, len); }
+
+	// Write out every pixel held by `pixels`, three scaled bytes per pixel, each passed through D::adjust.
+	// When FLAGS contains FLAG_START_BIT a 1 bit is clocked out by hand ahead of each pixel and every
+	// byte is waited on right after it is queued. D::postBlock(len) runs once after the last pixel.
+	template <uint8_t FLAGS, class D, EOrder RGB_ORDER> void writePixels(PixelController<RGB_ORDER> pixels) {
+		//setSPIRate();
+		int len = pixels.mLen;
+
+		select();
+		while(pixels.has(1)) {
+			if(FLAGS & FLAG_START_BIT) {
+				writeBit<0>(1);
+				writeBytePostWait(D::adjust(pixels.loadAndScale0()));
+				writeBytePostWait(D::adjust(pixels.loadAndScale1()));
+				writeBytePostWait(D::adjust(pixels.loadAndScale2()));
+			} else {
+				writeByte(D::adjust(pixels.loadAndScale0()));
+				writeByte(D::adjust(pixels.loadAndScale1()));
+				writeByte(D::adjust(pixels.loadAndScale2()));
+			}
+
+			pixels.advanceData();
+			pixels.stepDithering();
+		}
+		D::postBlock(len);
+		// wait on the last queued byte before deselecting, as the USART0 variant of this class does
+		waitFully();
+		release();
+	}
+};
+#endif
+
+#if defined(UBRR0)
+// SPI-style output built on USART0 configured for master SPI (MSPI) mode.
+//
+// Template parameters:
+//   _DATA_PIN / _CLOCK_PIN - pins switched to output in init() and toggled by hand in writeBit()
+//   _SPI_CLOCK_DIVIDER     - requested clock divider, translated into a UBRR0 value by setSPIRate()
+template <uint8_t _DATA_PIN, uint8_t _CLOCK_PIN, uint32_t _SPI_CLOCK_DIVIDER>
+class AVRUSART0SPIOutput {
+	// optional chip-select handler; NULL means there is nothing to select/release
+	Selectable *m_pSelect;
+
+public:
+	AVRUSART0SPIOutput() { m_pSelect = NULL; }
+	AVRUSART0SPIOutput(Selectable *pSelect) { m_pSelect = pSelect; }
+	void setSelect(Selectable *pSelect) { m_pSelect = pSelect; }
+
+	// Configure USART0 for MSPI operation and turn the data/clock pins into outputs.
+	void init() {
+		UBRR0 = 0;
+
+		/* Set MSPI mode of operation and SPI data mode 0. */
+		UCSR0C = (1<<UMSEL01)|(1<<UMSEL00)/*|(0<<UCPHA0)*/|(0<<UCPOL0);
+		/* Enable receiver and transmitter. */
+		UCSR0B = (1<<RXEN0)|(1<<TXEN0);
+
+		FastPin<_CLOCK_PIN>::setOutput();
+		FastPin<_DATA_PIN>::setOutput();
+
+
+		// must be done last, see page 206
+		setSPIRate();
+	}
+
+	// Program UBRR0 from the divider: (divider/2)-1 for dividers above 2, otherwise 0.
+	void setSPIRate() {
+		if(_SPI_CLOCK_DIVIDER > 2) {
+			UBRR0 = (_SPI_CLOCK_DIVIDER/2)-1;
+		} else {
+			UBRR0 = 0;
+		}
+	}
+
+	static void stop() {
+		// TODO: stop the uart spi output
+	}
+
+	// Swap the "a byte has been queued since the last wait" flag: stores `wait` as the
+	// new value and returns the value that was stored before the call.
+	static bool shouldWait(bool wait = false) __attribute__((always_inline)) {
+		static bool sWait=false;
+		const bool previous = sWait;
+		sWait = wait;
+		return previous;
+	}
+	// If a byte was queued since the last wait, spin until UDRE0 is set in UCSR0A.
+	static void wait() __attribute__((always_inline)) {
+		if(shouldWait()) {
+			while(!(UCSR0A & (1<<UDRE0)));
+		}
+	}
+	static void waitFully() __attribute__((always_inline)) { wait(); }
+
+	// Write a 16-bit word, high byte first.
+	static void writeWord(uint16_t w) __attribute__((always_inline)) { writeByte(w>>8); writeByte(w&0xFF); }
+
+	// The three byte writers differ only in when they wait: before the write, after it, or not at all.
+	static void writeByte(uint8_t b) __attribute__((always_inline)) { wait(); UDR0=b;  shouldWait(true); }
+	static void writeBytePostWait(uint8_t b) __attribute__((always_inline)) { UDR0=b; shouldWait(true); wait(); }
+	static void writeByteNoWait(uint8_t b) __attribute__((always_inline)) { UDR0=b; shouldWait(true); }
+
+
+	// Clock out a single bit by hand: drive the data pin from bit BIT of b, then pulse the clock pin.
+	template <uint8_t BIT> inline static void writeBit(uint8_t b) {
+		// bitwise test of the selected bit (a logical && here would be true for any non-zero b)
+		if(b & (1 << BIT)) {
+			FastPin<_DATA_PIN>::hi();
+		} else {
+			FastPin<_DATA_PIN>::lo();
+		}
+
+		FastPin<_CLOCK_PIN>::hi();
+		FastPin<_CLOCK_PIN>::lo();
+	}
+
+	void enable_pins() { }
+	void disable_pins() { }
+
+	// Start a transaction: assert the select handler (if any) and re-apply the clock rate.
+	void select() {
+		if(m_pSelect != NULL) {
+			m_pSelect->select();
+		}
+		enable_pins();
+		setSPIRate();
+	}
+
+	// End a transaction: release the select handler (if any).
+	void release() {
+		if(m_pSelect != NULL) {
+			m_pSelect->release();
+		}
+		disable_pins();
+	}
+
+	// Write `value` out `len` times without touching select/release.
+	static void writeBytesValueRaw(uint8_t value, int len) {
+		while(len--) {
+			writeByte(value);
+		}
+	}
+
+	// Write `value` out `len` times inside its own select/release pair.
+	void writeBytesValue(uint8_t value, int len) {
+		//setSPIRate();
+		select();
+		while(len--) {
+			writeByte(value);
+		}
+		release();
+	}
+
+	// Write a block of len uint8_ts out, passing each one through D::adjust first
+	template <class D> void writeBytes(register uint8_t *data, int len) {
+		//setSPIRate();
+		uint8_t *end = data + len;
+		select();
+		while(data != end) {
+			// a slight touch of delay here helps optimize the timing of the status register check loop (not used on ARM)
+			writeByte(D::adjust(*data++)); delaycycles<3>();
+		}
+		release();
+	}
+
+	void writeBytes(register uint8_t *data, int len) { writeBytes<DATA_NOP>(data, len); }
+
+	// Write out every pixel held by `pixels`, three scaled bytes per pixel, each passed through D::adjust.
+	// When FLAGS contains FLAG_START_BIT a 1 bit is clocked out by hand ahead of each pixel and every
+	// byte is waited on right after it is queued. D::postBlock(len) runs once after the last pixel.
+	template <uint8_t FLAGS, class D, EOrder RGB_ORDER> void writePixels(PixelController<RGB_ORDER> pixels) {
+		//setSPIRate();
+		int len = pixels.mLen;
+
+		select();
+		while(pixels.has(1)) {
+			if(FLAGS & FLAG_START_BIT) {
+				writeBit<0>(1);
+				writeBytePostWait(D::adjust(pixels.loadAndScale0()));
+				writeBytePostWait(D::adjust(pixels.loadAndScale1()));
+				writeBytePostWait(D::adjust(pixels.loadAndScale2()));
+			} else {
+				writeByte(D::adjust(pixels.loadAndScale0()));
+				writeByte(D::adjust(pixels.loadAndScale1()));
+				writeByte(D::adjust(pixels.loadAndScale2()));
+			}
+
+			pixels.advanceData();
+			pixels.stepDithering();
+		}
+		D::postBlock(len);
+		waitFully();
+		release();
+	}
+};
+
+#endif
+
+
+#if defined(SPSR)
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Hardware SPI support using SPDR registers and friends
+//
+// Technically speaking, this uses the AVR SPI registers.  This will work on the Teensy 3.0 because Paul made a set of compatibility
+// classes that map the AVR SPI registers to ARM's, however this caps the performance of output.
+//
+// TODO: implement ARMHardwareSPIOutput
+//
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Hardware SPI output using the SPCR/SPSR/SPDR register set.
+//
+// Template parameters:
+//   _DATA_PIN / _CLOCK_PIN - pins switched to output in init() and toggled by hand in writeBit()
+//   _SPI_CLOCK_DIVIDER     - requested clock divider, mapped onto SPR1/SPR0/SPI2X by setSPIRate()
+template <uint8_t _DATA_PIN, uint8_t _CLOCK_PIN, uint32_t _SPI_CLOCK_DIVIDER>
+class AVRHardwareSPIOutput {
+	// optional chip-select handler; NULL means there is nothing to select/release
+	Selectable *m_pSelect;
+	// not read anywhere in this class (the wait flag lives in shouldWait()); initialised by both constructors
+	bool mWait;
+public:
+	AVRHardwareSPIOutput() { m_pSelect = NULL; mWait = false;}
+	AVRHardwareSPIOutput(Selectable *pSelect) { m_pSelect = pSelect; mWait = false; }
+	void setSelect(Selectable *pSelect) { m_pSelect = pSelect; }
+
+	// Translate _SPI_CLOCK_DIVIDER into the SPR1/SPR0 prescaler bits of SPCR and the SPI2X bit of SPSR.
+	void setSPIRate() {
+		SPCR &= ~ ( (1<<SPR1) | (1<<SPR0) ); 	// clear out the prescaler bits
+
+	    bool b2x = false;
+
+	    if(_SPI_CLOCK_DIVIDER >= 128) { SPCR |= (1<<SPR1); SPCR |= (1<<SPR0); }
+	    else if(_SPI_CLOCK_DIVIDER >= 64) { SPCR |= (1<<SPR1);}
+	    else if(_SPI_CLOCK_DIVIDER >= 32) { SPCR |= (1<<SPR1); b2x = true;  }
+	    else if(_SPI_CLOCK_DIVIDER >= 16) { SPCR |= (1<<SPR0); }
+	    else if(_SPI_CLOCK_DIVIDER >= 8) { SPCR |= (1<<SPR0); b2x = true; }
+	    else if(_SPI_CLOCK_DIVIDER >= 4) { /* do nothing - default rate */ }
+	    else { b2x = true; }
+
+	    if(b2x) { SPSR |= (1<<SPI2X); }
+	    else { SPSR &= ~ (1<<SPI2X); }
+	}
+
+	// Set up the pins, enable the SPI block as master, apply the clock rate and leave the bus released.
+	void init() {
+		volatile uint8_t clr;
+
+		// set the pins to output
+		FastPin<_DATA_PIN>::setOutput();
+		FastPin<_CLOCK_PIN>::setOutput();
+#ifdef SPI_SELECT
+		// Make sure the slave select line is set to output, or arduino will block us
+		FastPin<SPI_SELECT>::setOutput();
+		FastPin<SPI_SELECT>::lo();
+#endif
+
+		SPCR |= ((1<<SPE) | (1<<MSTR) ); 		// enable SPI as master
+
+		clr = SPSR; // clear SPI status register
+		clr = SPDR; // clear SPI data register
+		clr;
+
+		// prescaler and double-speed bits are programmed in one place only
+		setSPIRate();
+
+		SPDR=0;
+		shouldWait(false);
+		release();
+	}
+
+	// Swap the "a byte has been queued since the last wait" flag: stores `wait` as the
+	// new value and returns the value that was stored before the call.
+	static bool shouldWait(bool wait = false) __attribute__((always_inline)) {
+		static bool sWait=false;
+		const bool previous = sWait;
+		sWait = wait;
+		return previous;
+	}
+	// If a byte was queued since the last wait, spin until SPIF is set in SPSR.
+	static void wait() __attribute__((always_inline)) { if(shouldWait()) { while(!(SPSR & (1<<SPIF))); } }
+	static void waitFully() __attribute__((always_inline)) { wait(); }
+
+	// Write a 16-bit word, high byte first.
+	static void writeWord(uint16_t w) __attribute__((always_inline)) { writeByte(w>>8); writeByte(w&0xFF); }
+
+	// The three byte writers differ only in when they wait: before the write, after it, or not at all.
+	static void writeByte(uint8_t b) __attribute__((always_inline)) { wait(); SPDR=b;  shouldWait(true); }
+	static void writeBytePostWait(uint8_t b) __attribute__((always_inline)) { SPDR=b; shouldWait(true); wait(); }
+	static void writeByteNoWait(uint8_t b) __attribute__((always_inline)) { SPDR=b; shouldWait(true); }
+
+	// Clock out a single bit by hand: SPE is cleared while the data pin is driven from bit BIT of b
+	// and the clock pin is pulsed, then SPE is set again and the wait flag is reset.
+	template <uint8_t BIT> inline static void writeBit(uint8_t b) {
+		SPCR &= ~(1 << SPE);
+		if(b & (1 << BIT)) {
+			FastPin<_DATA_PIN>::hi();
+		} else {
+			FastPin<_DATA_PIN>::lo();
+		}
+
+		FastPin<_CLOCK_PIN>::hi();
+		FastPin<_CLOCK_PIN>::lo();
+		SPCR |= 1 << SPE;
+		shouldWait(false);
+	}
+
+	void enable_pins() {
+		SPCR |= ((1<<SPE) | (1<<MSTR) ); 		// enable SPI as master
+	}
+
+	void disable_pins() {
+		SPCR &= ~(((1<<SPE) | (1<<MSTR) )); // disable SPI
+	}
+
+	// Start a transaction: assert the select handler (if any), enable SPI and re-apply the clock rate.
+	void select() {
+		if(m_pSelect != NULL) { m_pSelect->select(); }
+		enable_pins();
+		setSPIRate();
+	}
+
+	// End a transaction: release the select handler (if any) and disable SPI.
+	void release() {
+		if(m_pSelect != NULL) { m_pSelect->release(); }
+		disable_pins();
+	}
+
+	// Write `value` out `len` times without touching select/release.
+	static void writeBytesValueRaw(uint8_t value, int len) {
+		while(len--) { writeByte(value); }
+	}
+
+	// Write `value` out `len` times inside its own select/release pair.
+	void writeBytesValue(uint8_t value, int len) {
+		//setSPIRate();
+		select();
+		while(len--) {
+			writeByte(value);
+		}
+		// release() clears SPE, so wait on the last queued byte first (same as writePixels)
+		waitFully();
+		release();
+	}
+
+	// Write a block of len uint8_ts out, passing each one through D::adjust first
+	template <class D> void writeBytes(register uint8_t *data, int len) {
+		//setSPIRate();
+		uint8_t *end = data + len;
+		select();
+		while(data != end) {
+			// a slight touch of delay here helps optimize the timing of the status register check loop (not used on ARM)
+			writeByte(D::adjust(*data++)); delaycycles<3>();
+		}
+		// release() clears SPE, so wait on the last queued byte first (same as writePixels)
+		waitFully();
+		release();
+	}
+
+	void writeBytes(register uint8_t *data, int len) { writeBytes<DATA_NOP>(data, len); }
+
+	// Write out every pixel held by `pixels`, three scaled bytes per pixel, each passed through D::adjust.
+	// When FLAGS contains FLAG_START_BIT a 1 bit is clocked out by hand ahead of each pixel and every
+	// byte is waited on right after it is queued. D::postBlock(len) runs once after the last pixel.
+	template <uint8_t FLAGS, class D, EOrder RGB_ORDER> void writePixels(PixelController<RGB_ORDER> pixels) {
+		//setSPIRate();
+		int len = pixels.mLen;
+
+		select();
+		while(pixels.has(1)) {
+			if(FLAGS & FLAG_START_BIT) {
+				writeBit<0>(1);
+				writeBytePostWait(D::adjust(pixels.loadAndScale0()));
+				writeBytePostWait(D::adjust(pixels.loadAndScale1()));
+				writeBytePostWait(D::adjust(pixels.loadAndScale2()));
+			} else {
+				writeByte(D::adjust(pixels.loadAndScale0()));
+				writeByte(D::adjust(pixels.loadAndScale1()));
+				writeByte(D::adjust(pixels.loadAndScale2()));
+			}
+
+			pixels.advanceData();
+			pixels.stepDithering();
+		}
+		D::postBlock(len);
+		waitFully();
+		release();
+	}
+};
+#elif defined(SPSR0)
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Hardware SPI support using SPDR0 registers and friends
+//
+// Technically speaking, this uses the AVR SPI registers.  This will work on the Teensy 3.0 because Paul made a set of compatibility
+// classes that map the AVR SPI registers to ARM's, however this caps the performance of output.
+//
+// TODO: implement ARMHardwareSPIOutput
+//
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Hardware SPI output using the SPCR0/SPSR0/SPDR0 register set.
+//
+// Template parameters:
+//   _DATA_PIN / _CLOCK_PIN - pins switched to output in init() and toggled by hand in writeBit()
+//   _SPI_CLOCK_DIVIDER     - requested clock divider, mapped onto SPR1/SPR0/SPI2X by setSPIRate()
+template <uint8_t _DATA_PIN, uint8_t _CLOCK_PIN, uint32_t _SPI_CLOCK_DIVIDER>
+class AVRHardwareSPIOutput {
+	// optional chip-select handler; NULL means there is nothing to select/release
+	Selectable *m_pSelect;
+	// not read anywhere in this class (the wait flag lives in shouldWait()); initialised by both constructors
+	bool mWait;
+public:
+	AVRHardwareSPIOutput() { m_pSelect = NULL; mWait = false;}
+	AVRHardwareSPIOutput(Selectable *pSelect) { m_pSelect = pSelect; mWait = false; }
+	void setSelect(Selectable *pSelect) { m_pSelect = pSelect; }
+
+	// Translate _SPI_CLOCK_DIVIDER into the SPR1/SPR0 prescaler bits of SPCR0 and the SPI2X bit of SPSR0.
+	void setSPIRate() {
+		// clear the same two bits (SPR1, SPR0) that the branches below set
+		SPCR0 &= ~ ( (1<<SPR1) | (1<<SPR0) ); 	// clear out the prescaler bits
+
+	    bool b2x = false;
+
+	    if(_SPI_CLOCK_DIVIDER >= 128) { SPCR0 |= (1<<SPR1); SPCR0 |= (1<<SPR0); }
+	    else if(_SPI_CLOCK_DIVIDER >= 64) { SPCR0 |= (1<<SPR1);}
+	    else if(_SPI_CLOCK_DIVIDER >= 32) { SPCR0 |= (1<<SPR1); b2x = true;  }
+	    else if(_SPI_CLOCK_DIVIDER >= 16) { SPCR0 |= (1<<SPR0); }
+	    else if(_SPI_CLOCK_DIVIDER >= 8) { SPCR0 |= (1<<SPR0); b2x = true; }
+	    else if(_SPI_CLOCK_DIVIDER >= 4) { /* do nothing - default rate */ }
+	    else { b2x = true; }
+
+	    if(b2x) { SPSR0 |= (1<<SPI2X); }
+	    else { SPSR0 &= ~ (1<<SPI2X); }
+	}
+
+	// Set up the pins, enable the SPI block as master, apply the clock rate and leave the bus released.
+	void init() {
+		volatile uint8_t clr;
+
+		// set the pins to output
+		FastPin<_DATA_PIN>::setOutput();
+		FastPin<_CLOCK_PIN>::setOutput();
+#ifdef SPI_SELECT
+		// Make sure the slave select line is set to output, or arduino will block us
+		FastPin<SPI_SELECT>::setOutput();
+		FastPin<SPI_SELECT>::lo();
+#endif
+
+		SPCR0 |= ((1<<SPE) | (1<<MSTR) ); 		// enable SPI as master
+
+		clr = SPSR0; // clear SPI status register
+		clr = SPDR0; // clear SPI data register
+		clr;
+
+		// prescaler and double-speed bits are programmed in one place only
+		setSPIRate();
+
+		SPDR0=0;
+		shouldWait(false);
+		release();
+	}
+
+	// Swap the "a byte has been queued since the last wait" flag: stores `wait` as the
+	// new value and returns the value that was stored before the call.
+	static bool shouldWait(bool wait = false) __attribute__((always_inline)) {
+		static bool sWait=false;
+		const bool previous = sWait;
+		sWait = wait;
+		return previous;
+	}
+	// If a byte was queued since the last wait, spin until SPIF is set in SPSR0.
+	static void wait() __attribute__((always_inline)) { if(shouldWait()) { while(!(SPSR0 & (1<<SPIF))); } }
+	static void waitFully() __attribute__((always_inline)) { wait(); }
+
+	// Write a 16-bit word, high byte first.
+	static void writeWord(uint16_t w) __attribute__((always_inline)) { writeByte(w>>8); writeByte(w&0xFF); }
+
+	// The three byte writers differ only in when they wait: before the write, after it, or not at all.
+	static void writeByte(uint8_t b) __attribute__((always_inline)) { wait(); SPDR0=b;  shouldWait(true); }
+	static void writeBytePostWait(uint8_t b) __attribute__((always_inline)) { SPDR0=b; shouldWait(true); wait(); }
+	static void writeByteNoWait(uint8_t b) __attribute__((always_inline)) { SPDR0=b; shouldWait(true); }
+
+	// Clock out a single bit by hand: SPE is cleared while the data pin is driven from bit BIT of b
+	// and the clock pin is pulsed, then SPE is set again and the wait flag is reset.
+	template <uint8_t BIT> inline static void writeBit(uint8_t b) {
+		SPCR0 &= ~(1 << SPE);
+		if(b & (1 << BIT)) {
+			FastPin<_DATA_PIN>::hi();
+		} else {
+			FastPin<_DATA_PIN>::lo();
+		}
+
+		FastPin<_CLOCK_PIN>::hi();
+		FastPin<_CLOCK_PIN>::lo();
+		SPCR0 |= 1 << SPE;
+		shouldWait(false);
+	}
+
+	void enable_pins() {
+		SPCR0 |= ((1<<SPE) | (1<<MSTR) ); 		// enable SPI as master
+	}
+
+	void disable_pins() {
+		SPCR0 &= ~(((1<<SPE) | (1<<MSTR) )); // disable SPI
+	}
+
+	// Start a transaction: assert the select handler (if any), enable SPI and re-apply the clock rate.
+	void select() {
+		if(m_pSelect != NULL) { m_pSelect->select(); }
+		enable_pins();
+		setSPIRate();
+	}
+
+	// End a transaction: release the select handler (if any) and disable SPI.
+	void release() {
+		if(m_pSelect != NULL) { m_pSelect->release(); }
+		disable_pins();
+	}
+
+	// Write `value` out `len` times without touching select/release.
+	static void writeBytesValueRaw(uint8_t value, int len) {
+		while(len--) { writeByte(value); }
+	}
+
+	// Write `value` out `len` times inside its own select/release pair.
+	void writeBytesValue(uint8_t value, int len) {
+		//setSPIRate();
+		select();
+		while(len--) {
+			writeByte(value);
+		}
+		// release() clears SPE, so wait on the last queued byte first (same as writePixels)
+		waitFully();
+		release();
+	}
+
+	// Write a block of len uint8_ts out, passing each one through D::adjust first
+	template <class D> void writeBytes(register uint8_t *data, int len) {
+		//setSPIRate();
+		uint8_t *end = data + len;
+		select();
+		while(data != end) {
+			// a slight touch of delay here helps optimize the timing of the status register check loop (not used on ARM)
+			writeByte(D::adjust(*data++)); delaycycles<3>();
+		}
+		// release() clears SPE, so wait on the last queued byte first (same as writePixels)
+		waitFully();
+		release();
+	}
+
+	void writeBytes(register uint8_t *data, int len) { writeBytes<DATA_NOP>(data, len); }
+
+	// Write out every pixel held by `pixels`, three scaled bytes per pixel, each passed through D::adjust.
+	// When FLAGS contains FLAG_START_BIT a 1 bit is clocked out by hand ahead of each pixel and every
+	// byte is waited on right after it is queued. D::postBlock(len) runs once after the last pixel.
+	template <uint8_t FLAGS, class D, EOrder RGB_ORDER> void writePixels(PixelController<RGB_ORDER> pixels) {
+		//setSPIRate();
+		int len = pixels.mLen;
+
+		select();
+		while(pixels.has(1)) {
+			if(FLAGS & FLAG_START_BIT) {
+				writeBit<0>(1);
+				writeBytePostWait(D::adjust(pixels.loadAndScale0()));
+				writeBytePostWait(D::adjust(pixels.loadAndScale1()));
+				writeBytePostWait(D::adjust(pixels.loadAndScale2()));
+			} else {
+				writeByte(D::adjust(pixels.loadAndScale0()));
+				writeByte(D::adjust(pixels.loadAndScale1()));
+				writeByte(D::adjust(pixels.loadAndScale2()));
+			}
+
+			pixels.advanceData();
+			pixels.stepDithering();
+		}
+		D::postBlock(len);
+		waitFully();
+		release();
+	}
+};
+#endif
+
+#else
+// #define FASTLED_FORCE_SOFTWARE_SPI
+#endif
+
+FASTLED_NAMESPACE_END;
+
+
+#endif
@@ -0,0 +1,67 @@
+#ifndef __INC_LED_SYSDEFS_AVR_H
+#define __INC_LED_SYSDEFS_AVR_H
+
+#define FASTLED_AVR
+
+#ifndef INTERRUPT_THRESHOLD
+#define INTERRUPT_THRESHOLD 2
+#endif
+
+#define FASTLED_SPI_BYTE_ONLY
+
+#include <avr/io.h>
+#include <avr/interrupt.h> // for cli/se definitions
+
+// Define the register types
+// Both aliases name the same type, a volatile 8-bit value. RoReg is not const-qualified,
+// so "read only" is a naming convention here rather than something the compiler enforces.
+typedef volatile       uint8_t RoReg; /**< Read only 8-bit register (by convention only; the typedef is not const) */
+typedef volatile       uint8_t RwReg; /**< Read-Write 8-bit register */
+
+
+// Default to disallowing interrupts (may want to gate this on teensy2 vs. other arm platforms, since the
+// teensy2 has a good, fast millis interrupt implementation)
+#ifndef FASTLED_ALLOW_INTERRUPTS
+#define FASTLED_ALLOW_INTERRUPTS 0
+#endif
+
+#if FASTLED_ALLOW_INTERRUPTS == 1
+#define FASTLED_ACCURATE_CLOCK
+#endif
+
+
+// Default to using PROGMEM here
+#ifndef FASTLED_USE_PROGMEM
+#define FASTLED_USE_PROGMEM 1
+#endif
+
+#if defined(ARDUINO_AVR_DIGISPARK) || defined(ARDUINO_AVR_DIGISPARKPRO)
+#ifndef NO_CORRECTION
+#define NO_CORRECTION 1
+#endif
+#endif
+
+// Declare the millisecond counter variable for the core being built against and
+// expose it under the common name MS_COUNTER.
+extern "C" {
+#  if defined(CORE_TEENSY) || defined(TEENSYDUINO)
+extern volatile unsigned long timer0_millis_count;
+#    define MS_COUNTER timer0_millis_count
+#  elif defined(ATTINY_CORE)
+extern volatile unsigned long millis_timer_millis;
+#    define MS_COUNTER millis_timer_millis
+#  else
+extern volatile unsigned long timer0_millis;
+#    define MS_COUNTER timer0_millis
+#  endif
+}
+
+// special defs for the tiny environments
+#if defined(__AVR_ATmega32U2__) || defined(__AVR_ATmega16U2__) || defined(__AVR_ATmega8U2__) || defined(__AVR_AT90USB162__) || defined(__AVR_ATtiny24__) || defined(__AVR_ATtiny44__) || defined(__AVR_ATtiny84__) || defined(__AVR_ATtiny25__) || defined(__AVR_ATtiny45__) || defined(__AVR_ATtiny85__) || defined(__AVR_ATtiny167__) || defined(__AVR_ATtiny87__) || defined(__AVR_ATtinyX41__) || defined(__AVR_ATtiny841__) || defined(__AVR_ATtiny441__)
+#define LIB8_ATTINY 1
+#define FASTLED_NEEDS_YIELD
+#endif
+
+// Decide whether the library has to supply yield(): when the condition below fails,
+// FASTLED_NEEDS_YIELD is defined and yield() is declared with C linkage.
+// NOTE(review): the condition tests LIB8_TINY, but the macro this header defines for the
+// tiny parts is LIB8_ATTINY; LIB8_TINY is not defined anywhere in this header - confirm
+// which name is intended.
+#if defined(ARDUINO) && (ARDUINO > 150) && !defined(IS_BEAN) && !defined (ARDUINO_AVR_DIGISPARK) && !defined (LIB8_TINY) && !defined (ARDUINO_AVR_LARDU_328E)
+// don't need YIELD defined by the library 
+#else 
+#define FASTLED_NEEDS_YIELD
+extern "C" void yield();
+#endif
+#endif