strip-controller-esp8266/.pio/libdeps/local/FastLED/platforms/arm/common/m0clockless.h
2020-07-17 18:55:06 +02:00

390 lines
14 KiB
C++

#ifndef __INC_M0_CLOCKLESS_H
#define __INC_M0_CLOCKLESS_H
struct M0ClocklessData {
uint8_t d[3];
uint8_t e[3];
uint8_t adj;
uint8_t pad;
uint32_t s[3];
};
template<int HI_OFFSET, int LO_OFFSET, int T1, int T2, int T3, EOrder RGB_ORDER, int WAIT_TIME>int
showLedData(volatile uint32_t *_port, uint32_t _bitmask, const uint8_t *_leds, uint32_t num_leds, struct M0ClocklessData *pData) {
// Lo register variables
register uint32_t scratch=0;
register struct M0ClocklessData *base = pData;
register volatile uint32_t *port = _port;
register uint32_t d=0;
register uint32_t counter=num_leds;
register uint32_t bn=0;
register uint32_t b=0;
register uint32_t bitmask = _bitmask;
// high register variable
register const uint8_t *leds = _leds;
#if (FASTLED_SCALE8_FIXED == 1)
pData->s[0]++;
pData->s[1]++;
pData->s[2]++;
#endif
asm __volatile__ (
///////////////////////////////////////////////////////////////////////////
//
// asm macro definitions - used to assemble the clockless output
//
".ifnotdef fl_delay_def;"
#ifdef FASTLED_ARM_M0_PLUS
" .set fl_is_m0p, 1;"
" .macro m0pad;"
" nop;"
" .endm;"
#else
" .set fl_is_m0p, 0;"
" .macro m0pad;"
" .endm;"
#endif
" .set fl_delay_def, 1;"
" .set fl_delay_mod, 4;"
" .if fl_is_m0p == 1;"
" .set fl_delay_mod, 3;"
" .endif;"
" .macro fl_delay dtime, reg=r0;"
" .if (\\dtime > 0);"
" .set dcycle, (\\dtime / fl_delay_mod);"
" .set dwork, (dcycle * fl_delay_mod);"
" .set drem, (\\dtime - dwork);"
" .rept (drem);"
" nop;"
" .endr;"
" .if dcycle > 0;"
" mov \\reg, #dcycle;"
" delayloop_\\@:;"
" sub \\reg, #1;"
" bne delayloop_\\@;"
" .if fl_is_m0p == 0;"
" nop;"
" .endif;"
" .endif;"
" .endif;"
" .endm;"
" .macro mod_delay dtime,b1,b2,reg;"
" .set adj, (\\b1 + \\b2);"
" .if adj < \\dtime;"
" .set dtime2, (\\dtime - adj);"
" fl_delay dtime2, \\reg;"
" .endif;"
" .endm;"
// check the bit and drop the line low if it isn't set
" .macro qlo4 b,bitmask,port,loff ;"
" lsl \\b, #1 ;"
" bcs skip_\\@ ;"
" str \\bitmask, [\\port, \\loff] ;"
" skip_\\@: ;"
" m0pad;"
" .endm ;"
// set the pin hi or low (determined by the offset passed in )
" .macro qset2 bitmask,port,loff;"
" str \\bitmask, [\\port, \\loff];"
" m0pad;"
" .endm;"
// Load up the next led byte to work with, put it in bn
" .macro loadleds3 leds, bn, rled, scratch;"
" mov \\scratch, \\leds;"
" ldrb \\bn, [\\scratch, \\rled];"
" .endm;"
// check whether or not we should dither
" .macro loaddither7 bn,d,base,rdither;"
" ldrb \\d, [\\base, \\rdither];"
" lsl \\d, #24;" //; shift high for the qadd w/bn
" lsl \\bn, #24;" //; shift high for the qadd w/d
" bne chkskip_\\@;" //; if bn==0, clear d;"
" eor \\d, \\d;" //; clear d;"
" m0pad;"
" chkskip_\\@:;"
" .endm;"
// Do the qadd8 for dithering -- there's two versions of this. The m0 version
// takes advantage of the 3 cycle branch to do two things after the branch,
// while keeping timing constant. The m0+, however, branches in 2 cycles, so
// we have to work around that a bit more. This is one of the few times
// where the m0 will actually be _more_ efficient than the m0+
" .macro dither5 bn,d;"
" .syntax unified;"
" .if fl_is_m0p == 0;"
" adds \\bn, \\d;" // do the add
" bcc dither5_1_\\@;"
" mvns \\bn, \\bn;" // set the low 24bits ot 1's
" lsls \\bn, \\bn, #24;" // move low 8 bits to the high bits
" dither5_1_\\@:;"
" nop;" // nop to keep timing in line
" .else;"
" adds \\bn, \\d;" // do the add"
" bcc dither5_2_\\@;"
" mvns \\bn, \\bn;" // set the low 24bits ot 1's
" dither5_2_\\@:;"
" bcc dither5_3_\\@;"
" lsls \\bn, \\bn, #24;" // move low 8 bits to the high bits
" dither5_3_\\@:;"
" .endif;"
" .syntax divided;"
" .endm;"
// Do our scaling
" .macro scale4 bn, base, scale, scratch;"
" ldr \\scratch, [\\base, \\scale];"
" lsr \\bn, \\bn, #24;" // bring bn back down to its low 8 bits
" mul \\bn, \\scratch;" // do the multiply
" .endm;"
// swap bn into b
" .macro swapbbn1 b,bn;"
" lsl \\b, \\bn, #16;" // put the 8 bits we want for output high
" .endm;"
// adjust the dithering value for the next time around (load e from memory
// to do the math)
" .macro adjdither7 base,d,rled,eoffset,scratch;"
" ldrb \\d, [\\base, \\rled];"
" ldrb \\scratch,[\\base,\\eoffset];" // load e
" .syntax unified;"
" subs \\d, \\scratch, \\d;" // d=e-d
" .syntax divided;"
" strb \\d, [\\base, \\rled];" // save d
" .endm;"
// increment the led pointer (base+6 has what we're incrementing by)
" .macro incleds3 leds, base, scratch;"
" ldrb \\scratch, [\\base, #6];" // load incremen
" add \\leds, \\leds, \\scratch;" // update leds pointer
" .endm;"
// compare and loop
" .macro cmploop5 counter,label;"
" .syntax unified;"
" subs \\counter, #1;"
" .syntax divided;"
" beq done_\\@;"
" m0pad;"
" b \\label;"
" done_\\@:;"
" .endm;"
" .endif;"
);
#define M0_ASM_ARGS : \
[leds] "+h" (leds), \
[counter] "+l" (counter), \
[scratch] "+l" (scratch), \
[d] "+l" (d), \
[bn] "+l" (bn), \
[b] "+l" (b) \
: \
[port] "l" (port), \
[base] "l" (base), \
[bitmask] "l" (bitmask), \
[hi_off] "I" (HI_OFFSET), \
[lo_off] "I" (LO_OFFSET), \
[led0] "I" (RO(0)), \
[led1] "I" (RO(1)), \
[led2] "I" (RO(2)), \
[e0] "I" (3+RO(0)), \
[e1] "I" (3+RO(1)), \
[e2] "I" (3+RO(2)), \
[scale0] "I" (4*(2+RO(0))), \
[scale1] "I" (4*(2+RO(1))), \
[scale2] "I" (4*(2+RO(2))), \
[T1] "I" (T1), \
[T2] "I" (T2), \
[T3] "I" (T3) \
:
/////////////////////////////////////////////////////////////////////////
// now for some convinience macros to make building our lines a bit cleaner
#define LOOP " loop_%=:"
#define HI2 " qset2 %[bitmask], %[port], %[hi_off];"
#define _D1 " mod_delay %c[T1],2,0,%[scratch];"
#define QLO4 " qlo4 %[b],%[bitmask],%[port], %[lo_off];"
#define LOADLEDS3(X) " loadleds3 %[leds], %[bn], %[led" #X "] ,%[scratch];"
#define _D2(ADJ) " mod_delay %c[T2],4," #ADJ ",%[scratch];"
#define LO2 " qset2 %[bitmask], %[port], %[lo_off];"
#define _D3(ADJ) " mod_delay %c[T3],2," #ADJ ",%[scratch];"
#define LOADDITHER7(X) " loaddither7 %[bn], %[d], %[base], %[led" #X "];"
#define DITHER5 " dither5 %[bn], %[d];"
#define SCALE4(X) " scale4 %[bn], %[base], %[scale" #X "], %[scratch];"
#define SWAPBBN1 " swapbbn1 %[b], %[bn];"
#define ADJDITHER7(X) " adjdither7 %[base],%[d],%[led" #X "],%[e" #X "],%[scratch];"
#define INCLEDS3 " incleds3 %[leds],%[base],%[scratch];"
#define CMPLOOP5 " cmploop5 %[counter], loop_%=;"
#define NOTHING ""
#if (defined(SEI_CHK) && (FASTLED_ALLOW_INTERRUPTS == 1))
// We're allowing interrupts and have hardware timer support defined -
// track the loop outside the asm code, to allow inserting the interrupt
// overrun checks.
asm __volatile__ (
// pre-load byte 0
LOADLEDS3(0) LOADDITHER7(0) DITHER5 SCALE4(0) ADJDITHER7(0) SWAPBBN1
M0_ASM_ARGS);
do {
asm __volatile__ (
// Write out byte 0, prepping byte 1
HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
HI2 _D1 QLO4 LOADLEDS3(1) _D2(3) LO2 _D3(0)
HI2 _D1 QLO4 LOADDITHER7(1) _D2(7) LO2 _D3(0)
HI2 _D1 QLO4 DITHER5 _D2(5) LO2 _D3(0)
HI2 _D1 QLO4 SCALE4(1) _D2(4) LO2 _D3(0)
HI2 _D1 QLO4 ADJDITHER7(1) _D2(7) LO2 _D3(0)
HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
HI2 _D1 QLO4 SWAPBBN1 _D2(1) LO2 _D3(0)
// Write out byte 1, prepping byte 2
HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
HI2 _D1 QLO4 LOADLEDS3(2) _D2(3) LO2 _D3(0)
HI2 _D1 QLO4 LOADDITHER7(2) _D2(7) LO2 _D3(0)
HI2 _D1 QLO4 DITHER5 _D2(5) LO2 _D3(0)
HI2 _D1 QLO4 SCALE4(2) _D2(4) LO2 _D3(0)
HI2 _D1 QLO4 ADJDITHER7(2) _D2(7) LO2 _D3(0)
HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
HI2 _D1 QLO4 SWAPBBN1 _D2(1) LO2 _D3(0)
// Write out byte 2, prepping byte 0
HI2 _D1 QLO4 INCLEDS3 _D2(3) LO2 _D3(0)
HI2 _D1 QLO4 LOADLEDS3(0) _D2(3) LO2 _D3(0)
HI2 _D1 QLO4 LOADDITHER7(0) _D2(7) LO2 _D3(0)
HI2 _D1 QLO4 DITHER5 _D2(5) LO2 _D3(0)
HI2 _D1 QLO4 SCALE4(0) _D2(4) LO2 _D3(0)
HI2 _D1 QLO4 ADJDITHER7(0) _D2(7) LO2 _D3(0)
HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
HI2 _D1 QLO4 SWAPBBN1 _D2(1) LO2 _D3(5)
M0_ASM_ARGS
);
SEI_CHK; INNER_SEI; --counter; CLI_CHK;
} while(counter);
#elif (FASTLED_ALLOW_INTERRUPTS == 1)
// We're allowing interrupts - track the loop outside the asm code, and
// re-enable interrupts in between each iteration.
asm __volatile__ (
// pre-load byte 0
LOADLEDS3(0) LOADDITHER7(0) DITHER5 SCALE4(0) ADJDITHER7(0) SWAPBBN1
M0_ASM_ARGS);
do {
asm __volatile__ (
// Write out byte 0, prepping byte 1
HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
HI2 _D1 QLO4 LOADLEDS3(1) _D2(3) LO2 _D3(0)
HI2 _D1 QLO4 LOADDITHER7(1) _D2(7) LO2 _D3(0)
HI2 _D1 QLO4 DITHER5 _D2(5) LO2 _D3(0)
HI2 _D1 QLO4 SCALE4(1) _D2(4) LO2 _D3(0)
HI2 _D1 QLO4 ADJDITHER7(1) _D2(7) LO2 _D3(0)
HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
HI2 _D1 QLO4 SWAPBBN1 _D2(1) LO2 _D3(0)
// Write out byte 1, prepping byte 2
HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
HI2 _D1 QLO4 LOADLEDS3(2) _D2(3) LO2 _D3(0)
HI2 _D1 QLO4 LOADDITHER7(2) _D2(7) LO2 _D3(0)
HI2 _D1 QLO4 DITHER5 _D2(5) LO2 _D3(0)
HI2 _D1 QLO4 SCALE4(2) _D2(4) LO2 _D3(0)
HI2 _D1 QLO4 ADJDITHER7(2) _D2(7) LO2 _D3(0)
HI2 _D1 QLO4 INCLEDS3 _D2(3) LO2 _D3(0)
HI2 _D1 QLO4 SWAPBBN1 _D2(1) LO2 _D3(0)
// Write out byte 2, prepping byte 0
HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
HI2 _D1 QLO4 LOADLEDS3(0) _D2(3) LO2 _D3(0)
HI2 _D1 QLO4 LOADDITHER7(0) _D2(7) LO2 _D3(0)
HI2 _D1 QLO4 DITHER5 _D2(5) LO2 _D3(0)
HI2 _D1 QLO4 SCALE4(0) _D2(4) LO2 _D3(0)
HI2 _D1 QLO4 ADJDITHER7(0) _D2(7) LO2 _D3(0)
HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
HI2 _D1 QLO4 SWAPBBN1 _D2(1) LO2 _D3(5)
M0_ASM_ARGS
);
uint32_t ticksBeforeInterrupts = SysTick->VAL;
sei();
--counter;
cli();
// If more than 45 uSecs have elapsed, give up on this frame and start over.
// Note: this isn't completely correct. It's possible that more than one
// millisecond will elapse, and so SysTick->VAL will lap
// ticksBeforeInterrupts.
// Note: ticksBeforeInterrupts DECREASES
const uint32_t kTicksPerMs = VARIANT_MCK / 1000;
const uint32_t kTicksPerUs = kTicksPerMs / 1000;
const uint32_t kTicksIn45us = kTicksPerUs * 45;
const uint32_t currentTicks = SysTick->VAL;
if (ticksBeforeInterrupts < currentTicks) {
// Timer started over
if ((ticksBeforeInterrupts + (kTicksPerMs - currentTicks)) > kTicksIn45us) {
return 0;
}
} else {
if ((ticksBeforeInterrupts - currentTicks) > kTicksIn45us) {
return 0;
}
}
} while(counter);
#else
// We're not allowing interrupts - run the entire loop in asm to keep things
// as tight as possible. In an ideal world, we should be pushing out ws281x
// leds (or other 3-wire leds) with zero gaps between pixels.
asm __volatile__ (
// pre-load byte 0
LOADLEDS3(0) LOADDITHER7(0) DITHER5 SCALE4(0) ADJDITHER7(0) SWAPBBN1
// loop over writing out the data
LOOP
// Write out byte 0, prepping byte 1
HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
HI2 _D1 QLO4 LOADLEDS3(1) _D2(3) LO2 _D3(0)
HI2 _D1 QLO4 LOADDITHER7(1) _D2(7) LO2 _D3(0)
HI2 _D1 QLO4 DITHER5 _D2(5) LO2 _D3(0)
HI2 _D1 QLO4 SCALE4(1) _D2(4) LO2 _D3(0)
HI2 _D1 QLO4 ADJDITHER7(1) _D2(7) LO2 _D3(0)
HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
HI2 _D1 QLO4 SWAPBBN1 _D2(1) LO2 _D3(0)
// Write out byte 1, prepping byte 2
HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
HI2 _D1 QLO4 LOADLEDS3(2) _D2(3) LO2 _D3(0)
HI2 _D1 QLO4 LOADDITHER7(2) _D2(7) LO2 _D3(0)
HI2 _D1 QLO4 DITHER5 _D2(5) LO2 _D3(0)
HI2 _D1 QLO4 SCALE4(2) _D2(4) LO2 _D3(0)
HI2 _D1 QLO4 ADJDITHER7(2) _D2(7) LO2 _D3(0)
HI2 _D1 QLO4 INCLEDS3 _D2(3) LO2 _D3(0)
HI2 _D1 QLO4 SWAPBBN1 _D2(1) LO2 _D3(0)
// Write out byte 2, prepping byte 0
HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
HI2 _D1 QLO4 LOADLEDS3(0) _D2(3) LO2 _D3(0)
HI2 _D1 QLO4 LOADDITHER7(0) _D2(7) LO2 _D3(0)
HI2 _D1 QLO4 DITHER5 _D2(5) LO2 _D3(0)
HI2 _D1 QLO4 SCALE4(0) _D2(4) LO2 _D3(0)
HI2 _D1 QLO4 ADJDITHER7(0) _D2(7) LO2 _D3(0)
HI2 _D1 QLO4 NOTHING _D2(0) LO2 _D3(0)
HI2 _D1 QLO4 SWAPBBN1 _D2(1) LO2 _D3(5) CMPLOOP5
M0_ASM_ARGS
);
#endif
return num_leds;
}
#endif